From 3fd53533a8bcc5a7f1fa275e28dfb6b05f28a941 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 13 Feb 2020 12:11:26 +0800 Subject: dm crypt: use crypt_integrity_aead() helper Replace test_bit(CRYPT_MODE_INTEGRITY_AEAD, XXX) with crypt_integrity_aead(). Signed-off-by: Yang Yingliang Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index c6a529873d0f..3df90daba89e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -230,6 +230,8 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, struct scatterlist *sg); +static bool crypt_integrity_aead(struct crypt_config *cc); + /* * Use this to access cipher attributes that are independent of the key. */ @@ -346,7 +348,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, unsigned bs; int log; - if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) + if (crypt_integrity_aead(cc)) bs = crypto_aead_blocksize(any_tfm_aead(cc)); else bs = crypto_skcipher_blocksize(any_tfm(cc)); @@ -712,7 +714,7 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) { + if (crypt_integrity_aead(cc)) { ti->error = "AEAD transforms not supported for EBOIV"; return -EINVAL; } -- cgit v1.2.3 From eaab4bde6e645a37febba9a4126dc71e5994e3a1 Mon Sep 17 00:00:00 2001 From: Erich Eckner Date: Wed, 12 Feb 2020 11:43:10 +0100 Subject: dm integrity: print device name in integrity_metadata() error message Similar to f710126cfc89c8df477002a26dee8407eb0b4acd ("dm crypt: print device name in integrity error message"), this message should also better identify the device with the integrity failure. Signed-off-by: Erich Eckner Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 2f03fecd312d..66b8cfb83087 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1558,7 +1558,8 @@ again: checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE); if (unlikely(r)) { if (r > 0) { - DMERR_LIMIT("Checksum failed at sector 0x%llx", + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: Checksum failed at sector 0x%llx", bio_devname(bio, b), (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); r = -EILSEQ; atomic64_inc(&ic->number_of_mismatches); -- cgit v1.2.3 From d53f1fafec9d086f1c5166436abefdaef30e0363 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 24 Feb 2020 10:20:31 +0100 Subject: dm writecache: do direct write if the cache is full If the cache device is full, we do a direct write to the origin device. Note that we must not do it if the written block is already in the cache. 
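As a rough, standalone illustration of the split arithmetic (made-up sector numbers; the variable names below are not the dm-writecache ones): only the part of the write that lies before the next already-cached block may go straight to the origin; the remainder is resubmitted and takes the normal cache path, so the cached copy is never bypassed.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t bio_sector = 100, bio_sectors = 16;    /* incoming write                */
        uint64_t next_cached_sector = 108;              /* first block already cached    */

        uint64_t next_boundary = next_cached_sector - bio_sector;
        uint64_t accepted = next_boundary < bio_sectors ? next_boundary : bio_sectors;

        printf("direct write covers %llu of %llu sectors\n",
               (unsigned long long)accepted, (unsigned long long)bio_sectors);
        return 0;
}
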
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index a09bdc000e64..dc1e10c6ecd7 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1194,6 +1194,7 @@ read_next_block: } } else { do { + bool found_entry = false; if (writecache_has_error(wc)) goto unlock_error; e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); @@ -1204,9 +1205,21 @@ read_next_block: wc->overwrote_committed = true; goto bio_copy; } + found_entry = true; } e = writecache_pop_from_freelist(wc, (sector_t)-1); if (unlikely(!e)) { + if (!found_entry) { + e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); + if (e) { + sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector; + BUG_ON(!next_boundary); + if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) { + dm_accept_partial_bio(bio, next_boundary); + } + } + goto unlock_remap_origin; + } writecache_wait_on_freelist(wc); continue; } -- cgit v1.2.3 From 93de44eb3fc8c3566f5315b0210630cc361526a7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 24 Feb 2020 10:20:32 +0100 Subject: dm writecache: implement the "cleaner" policy The "flush" or "flush_on_suspend" messages flush the whole cache. However, these flushing methods can take some time and the process is left in an interruptible state during the flush. Implement a "cleaner" option that offers an alternate flushing method. When this option is activated (either by a message or in the constructor arguments), the cache will not promote new writes (however, writes to already cached blocks are promoted, to avoid data corruption due to misordered writes) and it will gradually writeback any cached data. The userspace can then monitor the cleaning process with "dmsetup status". When the number of cached bloks drops to zero, the userspace can unload the dm-writecache target and replace it with dm-linear or other targets. 
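A minimal sketch of the resulting write routing (standalone C with a hypothetical helper, not the driver code):

#include <stdio.h>

enum route { ROUTE_CACHE, ROUTE_ORIGIN };

static enum route route_write(int cleaner, int already_cached)
{
        if (already_cached)
                return ROUTE_CACHE;     /* keep ordering for blocks still in the cache */
        return cleaner ? ROUTE_ORIGIN : ROUTE_CACHE;
}

int main(void)
{
        printf("cleaner, uncached block -> %s\n",
               route_write(1, 0) == ROUTE_ORIGIN ? "origin" : "cache");
        printf("cleaner, cached block   -> %s\n",
               route_write(1, 1) == ROUTE_ORIGIN ? "origin" : "cache");
        return 0;
}

The policy can be enabled either with the "cleaner" constructor argument or at run time with the "cleaner" message, and the number of still-cached blocks can be watched via "dmsetup status".
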
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 48 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index dc1e10c6ecd7..3f17dcc6b73e 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -160,6 +160,7 @@ struct dm_writecache { bool autocommit_time_set:1; bool writeback_fua_set:1; bool flush_on_suspend:1; + bool cleaner:1; unsigned writeback_all; struct workqueue_struct *writeback_wq; @@ -1021,6 +1022,28 @@ static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_w return 0; } +static void activate_cleaner(struct dm_writecache *wc) +{ + wc->flush_on_suspend = true; + wc->cleaner = true; + wc->freelist_high_watermark = wc->n_blocks; + wc->freelist_low_watermark = wc->n_blocks; +} + +static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc) +{ + if (argc != 1) + return -EINVAL; + + wc_lock(wc); + activate_cleaner(wc); + if (!dm_suspended(wc->ti)) + writecache_verify_watermark(wc); + wc_unlock(wc); + + return 0; +} + static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, char *result, unsigned maxlen) { @@ -1031,6 +1054,8 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, r = process_flush_mesg(argc, argv, wc); else if (!strcasecmp(argv[0], "flush_on_suspend")) r = process_flush_on_suspend_mesg(argc, argv, wc); + else if (!strcasecmp(argv[0], "cleaner")) + r = process_cleaner_mesg(argc, argv, wc); else DMERR("unrecognised message received: %s", argv[0]); @@ -1206,10 +1231,14 @@ read_next_block: goto bio_copy; } found_entry = true; + } else { + if (unlikely(wc->cleaner)) + goto direct_write; } e = writecache_pop_from_freelist(wc, (sector_t)-1); if (unlikely(!e)) { if (!found_entry) { +direct_write: e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); if (e) { sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector; @@ -2071,6 +2100,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); wc->autocommit_time_set = true; + } else if (!strcasecmp(string, "cleaner")) { + wc->cleaner = true; } else if (!strcasecmp(string, "fua")) { if (WC_MODE_PMEM(wc)) { wc->writeback_fua = true; @@ -2248,6 +2279,9 @@ overflow: do_div(x, 100); wc->freelist_low_watermark = x; + if (wc->cleaner) + activate_cleaner(wc); + r = writecache_alloc_entries(wc); if (r) { ti->error = "Cannot allocate memory"; @@ -2291,9 +2325,9 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args = 0; if (wc->start_sector) extra_args += 2; - if (wc->high_wm_percent_set) + if (wc->high_wm_percent_set && !wc->cleaner) extra_args += 2; - if (wc->low_wm_percent_set) + if (wc->low_wm_percent_set && !wc->cleaner) extra_args += 2; if (wc->max_writeback_jobs_set) extra_args += 2; @@ -2301,19 +2335,21 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args += 2; if (wc->autocommit_time_set) extra_args += 2; + if (wc->cleaner) + extra_args++; if (wc->writeback_fua_set) extra_args++; DMEMIT("%u", extra_args); if (wc->start_sector) DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); - if (wc->high_wm_percent_set) { + if (wc->high_wm_percent_set && !wc->cleaner) { x = (uint64_t)wc->freelist_high_watermark * 100; x += 
wc->n_blocks / 2; do_div(x, (size_t)wc->n_blocks); DMEMIT(" high_watermark %u", 100 - (unsigned)x); } - if (wc->low_wm_percent_set) { + if (wc->low_wm_percent_set && !wc->cleaner) { x = (uint64_t)wc->freelist_low_watermark * 100; x += wc->n_blocks / 2; do_div(x, (size_t)wc->n_blocks); @@ -2325,6 +2361,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); if (wc->autocommit_time_set) DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); + if (wc->cleaner) + DMEMIT(" cleaner"); if (wc->writeback_fua_set) DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); break; @@ -2333,7 +2371,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, -- cgit v1.2.3 From 3923d4854e189d84c6ec22e66d536d3498f2747c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 24 Feb 2020 10:20:33 +0100 Subject: dm writecache: implement gradual cleanup If a block is stored in the cache for too long, it will now be written to the underlying device and cleaned up. Add a new option "max_age" that specifies the maximum age of a block in milliseconds. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 3f17dcc6b73e..e5c7b9072dd0 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -26,6 +26,8 @@ #define AUTOCOMMIT_BLOCKS_SSD 65536 #define AUTOCOMMIT_BLOCKS_PMEM 64 #define AUTOCOMMIT_MSEC 1000 +#define MAX_AGE_DIV 16 +#define MAX_AGE_UNSPECIFIED -1UL #define BITMAP_GRANULARITY 65536 #if BITMAP_GRANULARITY < PAGE_SIZE @@ -88,6 +90,7 @@ struct wc_entry { :47 #endif ; + unsigned long age; #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS uint64_t original_sector; uint64_t seq_count; @@ -119,6 +122,7 @@ struct dm_writecache { size_t writeback_size; size_t freelist_high_watermark; size_t freelist_low_watermark; + unsigned long max_age; unsigned uncommitted_blocks; unsigned autocommit_blocks; @@ -130,6 +134,8 @@ struct dm_writecache { struct timer_list autocommit_timer; struct wait_queue_head freelist_wait; + struct timer_list max_age_timer; + atomic_t bio_in_progress[2]; struct wait_queue_head bio_in_progress_wait[2]; @@ -597,6 +603,7 @@ static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *i rb_link_node(&ins->rb_node, parent, node); rb_insert_color(&ins->rb_node, &wc->tree); list_add(&ins->lru, &wc->lru); + ins->age = jiffies; } static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) @@ -632,6 +639,16 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc) queue_work(wc->writeback_wq, &wc->writeback_work); } +static void writecache_max_age_timer(struct timer_list *t) +{ + struct dm_writecache *wc = from_timer(wc, t, max_age_timer); + + if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { + queue_work(wc->writeback_wq, &wc->writeback_work); + mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); + } +} + static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) { struct wc_entry *e; @@ -838,6 +855,7 @@ static void writecache_suspend(struct dm_target *ti) bool 
flush_on_suspend; del_timer_sync(&wc->autocommit_timer); + del_timer_sync(&wc->max_age_timer); wc_lock(wc); writecache_flush(wc); @@ -974,6 +992,9 @@ erase_this: writecache_verify_watermark(wc); + if (wc->max_age != MAX_AGE_UNSPECIFIED) + mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); + wc_unlock(wc); } @@ -1661,7 +1682,9 @@ restart: wbl.size = 0; while (!list_empty(&wc->lru) && (wc->writeback_all || - wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) { + wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark || + (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >= + wc->max_age - wc->max_age / MAX_AGE_DIV))) { n_walked++; if (unlikely(n_walked > WRITEBACK_LATENCY) && @@ -1924,9 +1947,11 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) wc->ti = ti; mutex_init(&wc->lock); + wc->max_age = MAX_AGE_UNSPECIFIED; writecache_poison_lists(wc); init_waitqueue_head(&wc->freelist_wait); timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); + timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0); for (i = 0; i < 2; i++) { atomic_set(&wc->bio_in_progress[i], 0); @@ -2100,6 +2125,14 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); wc->autocommit_time_set = true; + } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { + unsigned max_age_msecs; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1) + goto invalid_optional; + if (max_age_msecs > 86400000) + goto invalid_optional; + wc->max_age = msecs_to_jiffies(max_age_msecs); } else if (!strcasecmp(string, "cleaner")) { wc->cleaner = true; } else if (!strcasecmp(string, "fua")) { @@ -2361,6 +2394,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); if (wc->autocommit_time_set) DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); + if (wc->max_age != MAX_AGE_UNSPECIFIED) + DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age)); if (wc->cleaner) DMEMIT(" cleaner"); if (wc->writeback_fua_set) -- cgit v1.2.3 From dc8a01ae1dbd7bac98368da4d8f81632512429f5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 24 Feb 2020 10:20:34 +0100 Subject: dm writecache: optimize superblock write If we write a superblock in writecache_flush, we don't need to set bit and scan the bitmap for it - we can just write the superblock directly. Also, we can set the flag REQ_FUA on the write bio, so that we don't need to submit a flush bio afterwards. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index e5c7b9072dd0..e274e5a4d425 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -509,6 +509,34 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); } +static void ssd_commit_superblock(struct dm_writecache *wc) +{ + int r; + struct dm_io_region region; + struct dm_io_request req; + + region.bdev = wc->ssd_dev->bdev; + region.sector = 0; + region.count = PAGE_SIZE; + + if (unlikely(region.sector + region.count > wc->metadata_sectors)) + region.count = wc->metadata_sectors - region.sector; + + region.sector += wc->start_sector; + + req.bi_op = REQ_OP_WRITE; + req.bi_op_flags = REQ_SYNC | REQ_FUA; + req.mem.type = DM_IO_VMA; + req.mem.ptr.vma = (char *)wc->memory_map; + req.client = wc->dm_io; + req.notify.fn = NULL; + req.notify.context = NULL; + + r = dm_io(&req, 1, ®ion, NULL); + if (unlikely(r)) + writecache_error(wc, r, "error writing superblock"); +} + static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { if (WC_MODE_PMEM(wc)) @@ -759,8 +787,10 @@ static void writecache_flush(struct dm_writecache *wc) wc->seq_count++; pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); - writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); - writecache_commit_flushed(wc, false); + if (WC_MODE_PMEM(wc)) + writecache_commit_flushed(wc, false); + else + ssd_commit_superblock(wc); wc->overwrote_committed = false; -- cgit v1.2.3 From 75fa601934fda23d2f15bf44b09c2401942d8e15 Mon Sep 17 00:00:00 2001 From: "Shetty, Harshini X (EXT-Sony Mobile)" Date: Tue, 17 Mar 2020 09:15:45 +0000 Subject: dm verity fec: fix memory leak in verity_fec_dtr Fix below kmemleak detected in verity_fec_ctr. output_pool is allocated for each dm-verity-fec device. But it is not freed when dm-table for the verity target is removed. Hence free the output mempool in destructor function verity_fec_dtr. unreferenced object 0xffffffffa574d000 (size 4096): comm "init", pid 1667, jiffies 4294894890 (age 307.168s) hex dump (first 32 bytes): 8e 36 00 98 66 a8 0b 9b 00 00 00 00 00 00 00 00 .6..f........... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
backtrace: [<0000000060e82407>] __kmalloc+0x2b4/0x340 [<00000000dd99488f>] mempool_kmalloc+0x18/0x20 [<000000002560172b>] mempool_init_node+0x98/0x118 [<000000006c3574d2>] mempool_init+0x14/0x20 [<0000000008cb266e>] verity_fec_ctr+0x388/0x3b0 [<000000000887261b>] verity_ctr+0x87c/0x8d0 [<000000002b1e1c62>] dm_table_add_target+0x174/0x348 [<000000002ad89eda>] table_load+0xe4/0x328 [<000000001f06f5e9>] dm_ctl_ioctl+0x3b4/0x5a0 [<00000000bee5fbb7>] do_vfs_ioctl+0x5dc/0x928 [<00000000b475b8f5>] __arm64_sys_ioctl+0x70/0x98 [<000000005361e2e8>] el0_svc_common+0xa0/0x158 [<000000001374818f>] el0_svc_handler+0x6c/0x88 [<000000003364e9f4>] el0_svc+0x8/0xc [<000000009d84cec9>] 0xffffffffffffffff Fixes: a739ff3f543af ("dm verity: add support for forward error correction") Depends-on: 6f1c819c219f7 ("dm: convert to bioset_init()/mempool_init()") Cc: stable@vger.kernel.org Signed-off-by: Harshini Shetty Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-fec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 3ceeb6b404ed..49147e634046 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -551,6 +551,7 @@ void verity_fec_dtr(struct dm_verity *v) mempool_exit(&f->rs_pool); mempool_exit(&f->prealloc_pool); mempool_exit(&f->extra_pool); + mempool_exit(&f->output_pool); kmem_cache_destroy(f->cache); if (f->data_bufio) -- cgit v1.2.3 From b8fdd090376a7a46d17db316638fe54b965c2fb0 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 24 Mar 2020 21:22:45 +0800 Subject: dm zoned: remove duplicate nr_rnd_zones increase in dmz_init_zone() zmd->nr_rnd_zones was increased twice by mistake. The other place it is increased in dmz_init_zone() is the only one needed: 1131 zmd->nr_useable_zones++; 1132 if (dmz_is_rnd(zone)) { 1133 zmd->nr_rnd_zones++; ^^^ Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Cc: stable@vger.kernel.org Signed-off-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 516c7b671d25..369de15c4e80 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1109,7 +1109,6 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) switch (blkz->type) { case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); - zmd->nr_rnd_zones++; break; case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: -- cgit v1.2.3 From b93b6643e9b5a7f260b931e97f56ffa3fa65e26d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 22 Mar 2020 20:42:21 +0100 Subject: dm integrity: fix a crash with unusually large tag size If the user specifies tag size larger than HASH_MAX_DIGESTSIZE, there's a crash in integrity_metadata(). Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 66b8cfb83087..3cc12b55c34f 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1519,7 +1519,7 @@ static void integrity_metadata(struct work_struct *w) struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned extra_space = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; - char checksums_onstack[HASH_MAX_DIGESTSIZE]; + char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; unsigned sectors_to_process = dio->range.n_sectors; sector_t sector = dio->range.logical_sector; @@ -1749,7 +1749,7 @@ retry_kmap: } while (++s < ic->sectors_per_block); #ifdef INTERNAL_VERIFY if (ic->internal_hash) { - char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { -- cgit v1.2.3 From 7649194a1636ab5876e7c18337d7ddd63e1d4376 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 22 Mar 2020 20:42:22 +0100 Subject: dm integrity: remove sector type casts Since the commit 72deb455b5ec619ff043c30bc90025aa3de3cdda ("block: remove CONFIG_LBDAF") sector_t is always defined as unsigned long long. Delete the needless type casts in printk and avoids some warnings if DEBUG_PRINT is defined. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3cc12b55c34f..8ce69355b8dc 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -510,8 +510,8 @@ static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) { DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)", - (unsigned long long)sector, - (unsigned long long)n_sectors, + sector, + n_sectors, ic->sb->log2_sectors_per_block, ic->log2_blocks_per_bitmap_bit, mode); @@ -1560,7 +1560,7 @@ again: if (r > 0) { char b[BDEVNAME_SIZE]; DMERR_LIMIT("%s: Checksum failed at sector 0x%llx", bio_devname(bio, b), - (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); + (sector - ((r + ic->tag_size - 1) / ic->tag_size))); r = -EILSEQ; atomic64_inc(&ic->number_of_mismatches); } @@ -1644,14 +1644,14 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) } if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", - (unsigned long long)dio->range.logical_sector, bio_sectors(bio), - (unsigned long long)ic->provided_data_sectors); + dio->range.logical_sector, bio_sectors(bio), + ic->provided_data_sectors); return DM_MAPIO_KILL; } if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) { DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", ic->sectors_per_block, - (unsigned long long)dio->range.logical_sector, bio_sectors(bio)); + dio->range.logical_sector, bio_sectors(bio)); return DM_MAPIO_KILL; } @@ -1754,7 +1754,7 @@ retry_kmap: integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", - (unsigned long long)logical_sector); + logical_sector); } } #endif @@ -2405,7 +2405,7 @@ next_chunk: get_area_and_offset(ic, logical_sector, &area, &offset); } - DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors); + DEBUG_print("recalculating: %llx, %llx\n", 
logical_sector, n_sectors); if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { recalc_write_super(ic); @@ -2899,7 +2899,7 @@ static void dm_integrity_resume(struct dm_target *ti) DEBUG_print("testing recalc: %x\n", ic->sb->flags); if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); - DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors); + DEBUG_print("recalc pos: %llx / %llx\n", recalc_pos, ic->provided_data_sectors); if (recalc_pos < ic->provided_data_sectors) { queue_work(ic->recalc_wq, &ic->recalc_work); } else if (recalc_pos > ic->provided_data_sectors) { @@ -2929,10 +2929,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: DMEMIT("%llu %llu", - (unsigned long long)atomic64_read(&ic->number_of_mismatches), - (unsigned long long)ic->provided_data_sectors); + atomic64_read(&ic->number_of_mismatches), + ic->provided_data_sectors); if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) - DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector)); + DMEMIT(" %llu", le64_to_cpu(ic->sb->recalc_sector)); else DMEMIT(" -"); break; @@ -2953,7 +2953,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; - DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, + DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start, ic->tag_size, ic->mode, arg_count); if (ic->meta_dev) DMEMIT(" meta_device:%s", ic->meta_dev->name); @@ -2969,7 +2969,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" commit_time:%u", ic->autocommit_msec); } if (ic->mode == 'B') { - DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); + DMEMIT(" sectors_per_bit:%llu", (sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); } if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) @@ -3995,10 +3995,9 @@ try_smaller_buffer: DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); - DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, - (unsigned long long)ic->provided_data_sectors); + DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", ic->provided_data_sectors, ic->provided_data_sectors); DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); - DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal); + DEBUG_print(" bits_in_journal %llu\n", bits_in_journal); if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); -- cgit v1.2.3 From f6f72f32c22c0ba7b714685b13a257be981888f3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 22 Mar 2020 20:42:23 +0100 Subject: dm integrity: don't replay journal data past the end of the device Following commits will make it possible to shrink or extend the device. If the device was shrunk, we don't want to replay journal data pointing past the end of the device. 
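A standalone illustration of the replay guard with made-up numbers (the real check compares each journal entry's target sector against ic->provided_data_sectors):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t provided_data_sectors = 1024;  /* device size after a shrink */
        uint64_t journal_target[] = { 8, 1000, 1016, 1536, 2040 };
        unsigned int i;

        for (i = 0; i < sizeof(journal_target) / sizeof(journal_target[0]); i++)
                printf("sector %4llu: %s\n",
                       (unsigned long long)journal_target[i],
                       journal_target[i] >= provided_data_sectors ?
                       "skip (past the end)" : "replay");
        return 0;
}
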
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 8ce69355b8dc..3c10a672322f 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2194,6 +2194,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, sec &= ~(sector_t)(ic->sectors_per_block - 1); } } + if (unlikely(sec >= ic->provided_data_sectors)) + continue; get_area_and_offset(ic, sec, &area, &offset); restore_last_bytes(ic, access_journal_data(ic, i, j), je); for (k = j + 1; k < ic->journal_section_entries; k++) { @@ -2203,6 +2205,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, break; BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay); sec2 = journal_entry_get_sector(je2); + if (unlikely(sec2 >= ic->provided_data_sectors)) + break; get_area_and_offset(ic, sec2, &area2, &offset2); if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block)) break; -- cgit v1.2.3 From 87fb177b4cab154bc13efa14149e152628aeb37c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 22 Mar 2020 20:42:24 +0100 Subject: dm integrity: factor out get_provided_data_sectors() Move code to a new function get_provided_data_sectors(). Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3c10a672322f..90a9a5e62623 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3078,6 +3078,24 @@ static int calculate_device_limits(struct dm_integrity_c *ic) return 0; } +static void get_provided_data_sectors(struct dm_integrity_c *ic) +{ + if (!ic->meta_dev) { + int test_bit; + ic->provided_data_sectors = 0; + for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) { + __u64 prev_data_sectors = ic->provided_data_sectors; + + ic->provided_data_sectors |= (sector_t)1 << test_bit; + if (calculate_device_limits(ic)) + ic->provided_data_sectors = prev_data_sectors; + } + } else { + ic->provided_data_sectors = ic->data_device_sectors; + ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1); + } +} + static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors) { unsigned journal_sections; @@ -3105,20 +3123,15 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); - ic->provided_data_sectors = 0; - for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) { - __u64 prev_data_sectors = ic->provided_data_sectors; - - ic->provided_data_sectors |= (sector_t)1 << test_bit; - if (calculate_device_limits(ic)) - ic->provided_data_sectors = prev_data_sectors; - } + get_provided_data_sectors(ic); if (!ic->provided_data_sectors) return -EINVAL; } else { ic->sb->log2_interleave_sectors = 0; - ic->provided_data_sectors = ic->data_device_sectors; - ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1); + + get_provided_data_sectors(ic); + if (!ic->provided_data_sectors) + return -EINVAL; 
try_smaller_buffer: ic->sb->journal_sections = cpu_to_le32(0); -- cgit v1.2.3 From 1ac2c15a7bf859b92de269ba58597c94ce10811f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 22 Mar 2020 20:42:25 +0100 Subject: dm integrity: allow resize of the integrity device If the size of the underlying device changes, change the size of the integrity device too. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 90a9a5e62623..fafd9ec1d56c 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2833,9 +2833,29 @@ static void dm_integrity_postsuspend(struct dm_target *ti) static void dm_integrity_resume(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + __u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); int r; + DEBUG_print("resume\n"); + if (ic->provided_data_sectors != old_provided_data_sectors) { + if (ic->provided_data_sectors > old_provided_data_sectors && + ic->mode == 'B' && + ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { + rw_journal_sectors(ic, REQ_OP_READ, 0, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + block_bitmap_op(ic, ic->journal, old_provided_data_sectors, + ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + } + + ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } + if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { DEBUG_print("resume dirty_bitmap\n"); rw_journal_sectors(ic, REQ_OP_READ, 0, 0, @@ -3938,16 +3958,16 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } } - ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); - if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) { - /* test for overflow */ + if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { r = -EINVAL; - ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors"; + ti->error = "Journal mac mismatch"; goto bad; } - if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { + + get_provided_data_sectors(ic); + if (!ic->provided_data_sectors) { r = -EINVAL; - ti->error = "Journal mac mismatch"; + ti->error = "The device is too small"; goto bad; } @@ -4219,7 +4239,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, -- cgit v1.2.3 From 84597a44a9d86ac949900441cea7da0af0f2f473 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 22 Mar 2020 20:42:26 +0100 Subject: dm integrity: add optional discard support Add an argument "allow_discards" that enables discard processing on dm-integrity device. Discards are only allowed to devices using internal hash. 
When a block is discarded the integrity tag is filled with DISCARD_FILLER (0xf6) bytes. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 177 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 145 insertions(+), 32 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index fafd9ec1d56c..21eb35c606be 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -39,6 +39,7 @@ #define RECALC_WRITE_SUPER 16 #define BITMAP_BLOCK_SIZE 4096 /* don't change it */ #define BITMAP_FLUSH_INTERVAL (10 * HZ) +#define DISCARD_FILLER 0xf6 /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -257,6 +258,7 @@ struct dm_integrity_c { bool just_formatted; bool recalculate_flag; bool fix_padding; + bool discard; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; @@ -284,7 +286,7 @@ struct dm_integrity_io { struct work_struct work; struct dm_integrity_c *ic; - bool write; + enum req_opf op; bool fua; struct dm_integrity_range range; @@ -1299,6 +1301,11 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, unsigned *metadata_offset, unsigned total_size, int op) { +#define MAY_BE_FILLER 1 +#define MAY_BE_HASH 2 + unsigned hash_offset = 0; + unsigned may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + do { unsigned char *data, *dp; struct dm_buffer *b; @@ -1320,18 +1327,35 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se } else if (op == TAG_WRITE) { memcpy(dp, tag, to_copy); dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); - } else { + } else { /* e.g.: op == TAG_CMP */ - if (unlikely(memcmp(dp, tag, to_copy))) { - unsigned i; - for (i = 0; i < to_copy; i++) { - if (dp[i] != tag[i]) - break; - total_size--; + if (likely(is_power_of_2(ic->tag_size))) { + if (unlikely(memcmp(dp, tag, to_copy))) + if (unlikely(!ic->discard) || + unlikely(!memchr_inv(dp, DISCARD_FILLER, to_copy))) { + goto thorough_test; + } + } else { + unsigned i, ts; +thorough_test: + ts = total_size; + + for (i = 0; i < to_copy; i++, ts--) { + if (unlikely(dp[i] != tag[i])) + may_be &= ~MAY_BE_HASH; + if (likely(dp[i] != DISCARD_FILLER)) + may_be &= ~MAY_BE_FILLER; + hash_offset++; + if (unlikely(hash_offset == ic->tag_size)) { + if (unlikely(!may_be)) { + dm_bufio_release(b); + return ts; + } + hash_offset = 0; + may_be = MAY_BE_HASH | (ic->discard ? 
MAY_BE_FILLER : 0); + } } - dm_bufio_release(b); - return total_size; } } dm_bufio_release(b); @@ -1342,10 +1366,17 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se (*metadata_block)++; *metadata_offset = 0; } + + if (unlikely(!is_power_of_2(ic->tag_size))) { + hash_offset = (hash_offset + to_copy) % ic->tag_size; + } + total_size -= to_copy; } while (unlikely(total_size)); return 0; +#undef MAY_BE_FILLER +#undef MAY_BE_HASH } static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) @@ -1428,7 +1459,7 @@ static void dec_in_flight(struct dm_integrity_io *dio) remove_range(ic, &dio->range); - if (unlikely(dio->write)) + if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD)) schedule_autocommit(ic); bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); @@ -1520,14 +1551,19 @@ static void integrity_metadata(struct work_struct *w) char *checksums; unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; - unsigned sectors_to_process = dio->range.n_sectors; - sector_t sector = dio->range.logical_sector; + sector_t sector; + unsigned sectors_to_process; + sector_t save_metadata_block; + unsigned save_metadata_offset; if (unlikely(ic->mode == 'R')) goto skip_io; - checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, - GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + if (likely(dio->op != REQ_OP_DISCARD)) + checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + else + checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); if (!checksums) { checksums = checksums_onstack; if (WARN_ON(extra_space && @@ -1537,6 +1573,43 @@ static void integrity_metadata(struct work_struct *w) } } + if (unlikely(dio->op == REQ_OP_DISCARD)) { + sector_t bi_sector = dio->bio_details.bi_iter.bi_sector; + unsigned bi_size = dio->bio_details.bi_iter.bi_size; + unsigned max_size = likely(checksums != checksums_onstack) ? 
PAGE_SIZE : HASH_MAX_DIGESTSIZE; + unsigned max_blocks = max_size / ic->tag_size; + memset(checksums, DISCARD_FILLER, max_size); + + while (bi_size) { + unsigned this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block); + this_step_blocks = min(this_step_blocks, max_blocks); + r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, + this_step_blocks * ic->tag_size, TAG_WRITE); + if (unlikely(r)) { + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto error; + } + + /*if (bi_size < this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block)) { + printk("BUGG: bi_sector: %llx, bi_size: %u\n", bi_sector, bi_size); + printk("BUGG: this_step_blocks: %u\n", this_step_blocks); + BUG(); + }*/ + bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block); + bi_sector += this_step_blocks << ic->sb->log2_sectors_per_block; + } + + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto skip_io; + } + + save_metadata_block = dio->metadata_block; + save_metadata_offset = dio->metadata_offset; + sector = dio->range.logical_sector; + sectors_to_process = dio->range.n_sectors; + __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { unsigned pos; char *mem, *checksums_ptr; @@ -1555,7 +1628,7 @@ again: kunmap_atomic(mem); r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, - checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE); + checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); if (unlikely(r)) { if (r > 0) { char b[BDEVNAME_SIZE]; @@ -1599,7 +1672,7 @@ again: tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; this_len = min(biv.bv_len, data_to_process); r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, - this_len, !dio->write ? TAG_READ : TAG_WRITE); + this_len, dio->op == REQ_OP_READ ? 
TAG_READ : TAG_WRITE); if (unlikely(r)) goto error; data_to_process -= this_len; @@ -1626,6 +1699,20 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) dio->ic = ic; dio->bi_status = 0; + dio->op = bio_op(bio); + + if (unlikely(dio->op == REQ_OP_DISCARD)) { + if (ti->max_io_len) { + sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector); + unsigned log2_max_io_len = __fls(ti->max_io_len); + sector_t start_boundary = sec >> log2_max_io_len; + sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len; + if (start_boundary < end_boundary) { + sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1)); + dm_accept_partial_bio(bio, len); + } + } + } if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { submit_flush_bio(ic, dio); @@ -1633,8 +1720,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) } dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); - dio->write = bio_op(bio) == REQ_OP_WRITE; - dio->fua = dio->write && bio->bi_opf & REQ_FUA; + dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA; if (unlikely(dio->fua)) { /* * Don't pass down the FUA flag because we have to flush @@ -1655,7 +1741,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_KILL; } - if (ic->sectors_per_block > 1) { + if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) { struct bvec_iter iter; struct bio_vec bv; bio_for_each_segment(bv, bio, iter) { @@ -1688,7 +1774,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) } } - if (unlikely(ic->mode == 'R') && unlikely(dio->write)) + if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ)) return DM_MAPIO_KILL; get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); @@ -1718,13 +1804,13 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio, bio_advance_iter(bio, &bio->bi_iter, bv.bv_len); retry_kmap: mem = kmap_atomic(bv.bv_page); - if (likely(dio->write)) + if (likely(dio->op == REQ_OP_WRITE)) flush_dcache_page(bv.bv_page); do { struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry); - if (unlikely(!dio->write)) { + if (unlikely(dio->op == REQ_OP_READ)) { struct journal_sector *js; char *mem_ptr; unsigned s; @@ -1771,7 +1857,7 @@ retry_kmap: char *tag_addr; BUG_ON(PageHighMem(biv.bv_page)); tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; - if (likely(dio->write)) + if (likely(dio->op == REQ_OP_WRITE)) memcpy(tag_ptr, tag_addr, tag_now); else memcpy(tag_addr, tag_ptr, tag_now); @@ -1779,12 +1865,12 @@ retry_kmap: tag_ptr += tag_now; tag_todo -= tag_now; } while (unlikely(tag_todo)); else { - if (likely(dio->write)) + if (likely(dio->op == REQ_OP_WRITE)) memset(tag_ptr, 0, tag_todo); } } - if (likely(dio->write)) { + if (likely(dio->op == REQ_OP_WRITE)) { struct journal_sector *js; unsigned s; @@ -1820,12 +1906,12 @@ retry_kmap: bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT; } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT); - if (unlikely(!dio->write)) + if (unlikely(dio->op == REQ_OP_READ)) flush_dcache_page(bv.bv_page); kunmap_atomic(mem); } while (n_sectors); - if (likely(dio->write)) { + if (likely(dio->op == REQ_OP_WRITE)) { smp_mb(); if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) wake_up(&ic->copy_to_journal_wait); @@ -1857,7 +1943,9 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map unsigned journal_section, journal_entry; unsigned journal_read_pos; struct completion 
read_comp; - bool need_sync_io = ic->internal_hash && !dio->write; + bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ; + if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D') + need_sync_io = true; if (need_sync_io && from_map) { INIT_WORK(&dio->work, integrity_bio_wait); @@ -1875,8 +1963,8 @@ retry: } dio->range.n_sectors = bio_sectors(bio); journal_read_pos = NOT_FOUND; - if (likely(ic->mode == 'J')) { - if (dio->write) { + if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) { + if (dio->op == REQ_OP_WRITE) { unsigned next_entry, i, pos; unsigned ws, we, range_sectors; @@ -1979,7 +2067,7 @@ offload_to_thread: goto journal_read_write; } - if (ic->mode == 'B' && dio->write) { + if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) { if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { struct bitmap_block_status *bbs; @@ -2008,6 +2096,18 @@ offload_to_thread: bio->bi_end_io = integrity_end_io; bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; + if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) { + integrity_metadata(&dio->work); + dm_integrity_flush_buffers(ic); + + dio->in_flight = (atomic_t)ATOMIC_INIT(1); + dio->completion = NULL; + + generic_make_request(bio); + + return; + } + generic_make_request(bio); if (need_sync_io) { @@ -2969,6 +3069,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); + arg_count += ic->discard; arg_count += ic->mode == 'J'; arg_count += ic->mode == 'J'; arg_count += ic->mode == 'B'; @@ -2985,6 +3086,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) DMEMIT(" recalculate"); + if (ic->discard) + DMEMIT(" allow_discards"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); @@ -3771,6 +3874,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } else if (!strcmp(opt_string, "recalculate")) { ic->recalculate_flag = true; + } else if (!strcmp(opt_string, "allow_discards")) { + ic->discard = true; } else if (!strcmp(opt_string, "fix_padding")) { ic->fix_padding = true; } else { @@ -3829,6 +3934,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + if (ic->discard && !ic->internal_hash) { + r = -EINVAL; + ti->error = "Discard can be only used with internal hash"; + goto bad; + } + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); ic->autocommit_msec = sync_msec; timer_setup(&ic->autocommit_timer, autocommit_fn, 0); @@ -4158,6 +4269,8 @@ try_smaller_buffer: ti->num_flush_bios = 1; ti->flush_supported = true; + if (ic->discard) + ti->num_discard_bios = 1; return 0; -- cgit v1.2.3 From 31843edab7cb3924006544b901c9cab33941b684 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 22 Mar 2020 20:42:27 +0100 Subject: dm integrity: improve discard in journal mode When we discard something that is present in the journal, we flush the journal first, so that discarded blocks are not overwritten by the journal content. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 21eb35c606be..b989d109d55d 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1943,6 +1943,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map unsigned journal_section, journal_entry; unsigned journal_read_pos; struct completion read_comp; + bool discard_retried = false; bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ; if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D') need_sync_io = true; @@ -2059,6 +2060,21 @@ offload_to_thread: } } } + if (ic->mode == 'J' && likely(dio->op == REQ_OP_DISCARD) && !discard_retried) { + sector_t next_sector; + unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (unlikely(new_pos != NOT_FOUND) || + unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) { + remove_range_unlocked(ic, &dio->range); + spin_unlock_irq(&ic->endio_wait.lock); + queue_work(ic->commit_wq, &ic->commit_work); + flush_workqueue(ic->commit_wq); + queue_work(ic->writer_wq, &ic->writer_work); + flush_workqueue(ic->writer_wq); + discard_retried = true; + goto lock_retry; + } + } spin_unlock_irq(&ic->endio_wait.lock); if (unlikely(journal_read_pos != NOT_FOUND)) { -- cgit v1.2.3 From 1edaa447d958bec24c6a79685a5790d98976fd16 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Mar 2020 07:22:36 -0400 Subject: dm writecache: add cond_resched to avoid CPU hangs Initializing a dm-writecache device can take a long time when the persistent memory device is large. Add cond_resched() to a few loops to avoid warnings that the CPU is stuck. Cc: stable@vger.kernel.org # v4.18+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index e274e5a4d425..114927da9cc9 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -925,6 +925,7 @@ static int writecache_alloc_entries(struct dm_writecache *wc) struct wc_entry *e = &wc->entries[b]; e->index = b; e->write_in_progress = false; + cond_resched(); } return 0; @@ -979,6 +980,7 @@ static void writecache_resume(struct dm_target *ti) e->original_sector = le64_to_cpu(wme.original_sector); e->seq_count = le64_to_cpu(wme.seq_count); } + cond_resched(); } #endif for (b = 0; b < wc->n_blocks; b++) { @@ -1886,8 +1888,10 @@ static int init_memory(struct dm_writecache *wc) pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); - for (b = 0; b < wc->n_blocks; b++) + for (b = 0; b < wc->n_blocks; b++) { write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); + cond_resched(); + } writecache_flush_all_metadata(wc); writecache_commit_flushed(wc, false); -- cgit v1.2.3 From 4b5142905d4ff58a4b93f7c8eaa7ba829c0a53c9 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Fri, 27 Mar 2020 16:01:08 +0200 Subject: dm clone: Fix handling of partial region discards There is a bug in the way dm-clone handles discards, which can lead to discarding the wrong blocks or trying to discard blocks beyond the end of the device. 
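A concrete instance of the miscalculation, using the rs/re formulas quoted further down and assuming 8-sector (4K) regions (standalone computation, values made up):

#include <stdio.h>

int main(void)
{
        unsigned long region_size = 8, region_shift = 3;
        unsigned long bi_sector = 2, nr_sectors = 4;    /* discard sectors 2..5 */

        unsigned long rs = (bi_sector + region_size - 1) / region_size;  /* dm_sector_div_up   */
        unsigned long re = (bi_sector + nr_sectors) >> region_shift;     /* end sector >> shift */

        printf("rs=%lu re=%lu re-rs=%lu\n", rs, re, re - rs);
        return 0;
}

Here rs = 1 and re = 0, so the old "rs == re" check does not fire and re - rs wraps around.
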
This could lead to data corruption, if the destination device indeed discards the underlying blocks, i.e., if the discard operation results in the original contents of a block to be lost. The root of the problem is the code that calculates the range of regions covered by a discard request and decides which regions to discard. Since dm-clone handles the device in units of regions, we don't discard parts of a region, only whole regions. The range is calculated as: rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size); re = bio_end_sector(bio) >> clone->region_shift; , where 'rs' is the first region to discard and (re - rs) is the number of regions to discard. The bug manifests when we try to discard part of a single region, i.e., when we try to discard a block with size < region_size, and the discard request both starts at an offset with respect to the beginning of that region and ends before the end of the region. The root cause is the following comparison: if (rs == re) // skip discard and complete original bio immediately , which doesn't take into account that 'rs' might be greater than 're'. Thus, we then issue a discard request for the wrong blocks, instead of skipping the discard all together. Fix the check to also take into account the above case, so we don't end up discarding the wrong blocks. Also, add some range checks to dm_clone_set_region_hydrated() and dm_clone_cond_set_range(), which update dm-clone's region bitmap. Note that the aforementioned bug doesn't cause invalid memory accesses, because dm_clone_is_range_hydrated() returns True for this case, so the checks are just precautionary. Fixes: 7431b7835f55 ("dm: add clone target") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-clone-metadata.c | 13 +++++++++++++ drivers/md/dm-clone-target.c | 43 ++++++++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index c05b12110456..199e7af00858 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -850,6 +850,12 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re struct dirty_map *dmap; unsigned long word, flags; + if (unlikely(region_nr >= cmd->nr_regions)) { + DMERR("Region %lu out of range (total number of regions %lu)", + region_nr, cmd->nr_regions); + return -ERANGE; + } + word = region_nr / BITS_PER_LONG; spin_lock_irqsave(&cmd->bitmap_lock, flags); @@ -879,6 +885,13 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, struct dirty_map *dmap; unsigned long word, region_nr; + if (unlikely(start >= cmd->nr_regions || (start + nr_regions) < start || + (start + nr_regions) > cmd->nr_regions)) { + DMERR("Invalid region range: start %lu, nr_regions %lu (total number of regions %lu)", + start, nr_regions, cmd->nr_regions); + return -ERANGE; + } + spin_lock_irq(&cmd->bitmap_lock); if (cmd->read_only) { diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index d1e1b5b56b1b..022dddcad647 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -293,10 +293,17 @@ static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio) /* Get the region range covered by the bio */ static void bio_region_range(struct clone *clone, struct bio *bio, - unsigned long *rs, unsigned long *re) + unsigned long *rs, unsigned long *nr_regions) { + 
unsigned long end; + *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size); - *re = bio_end_sector(bio) >> clone->region_shift; + end = bio_end_sector(bio) >> clone->region_shift; + + if (*rs >= end) + *nr_regions = 0; + else + *nr_regions = end - *rs; } /* Check whether a bio overwrites a region */ @@ -454,7 +461,7 @@ static void trim_bio(struct bio *bio, sector_t sector, unsigned int len) static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success) { - unsigned long rs, re; + unsigned long rs, nr_regions; /* * If the destination device supports discards, remap and trim the @@ -463,9 +470,9 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ */ if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) { remap_to_dest(clone, bio); - bio_region_range(clone, bio, &rs, &re); + bio_region_range(clone, bio, &rs, &nr_regions); trim_bio(bio, rs << clone->region_shift, - (re - rs) << clone->region_shift); + nr_regions << clone->region_shift); generic_make_request(bio); } else bio_endio(bio); @@ -473,12 +480,21 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ static void process_discard_bio(struct clone *clone, struct bio *bio) { - unsigned long rs, re; + unsigned long rs, nr_regions; - bio_region_range(clone, bio, &rs, &re); - BUG_ON(re > clone->nr_regions); + bio_region_range(clone, bio, &rs, &nr_regions); + if (!nr_regions) { + bio_endio(bio); + return; + } - if (unlikely(rs == re)) { + if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs || + (rs + nr_regions) > clone->nr_regions)) { + DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)", + clone_device_name(clone), rs, nr_regions, + clone->nr_regions, + (unsigned long long)bio->bi_iter.bi_sector, + bio_sectors(bio)); bio_endio(bio); return; } @@ -487,7 +503,7 @@ static void process_discard_bio(struct clone *clone, struct bio *bio) * The covered regions are already hydrated so we just need to pass * down the discard. */ - if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) { + if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) { complete_discard_bio(clone, bio, true); return; } @@ -1169,7 +1185,7 @@ static void process_deferred_discards(struct clone *clone) int r = -EPERM; struct bio *bio; struct blk_plug plug; - unsigned long rs, re; + unsigned long rs, nr_regions; struct bio_list discards = BIO_EMPTY_LIST; spin_lock_irq(&clone->lock); @@ -1185,14 +1201,13 @@ static void process_deferred_discards(struct clone *clone) /* Update the metadata */ bio_list_for_each(bio, &discards) { - bio_region_range(clone, bio, &rs, &re); + bio_region_range(clone, bio, &rs, &nr_regions); /* * A discard request might cover regions that have been already * hydrated. There is no need to update the metadata for these * regions. */ - r = dm_clone_cond_set_range(clone->cmd, rs, re - rs); - + r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions); if (unlikely(r)) break; } -- cgit v1.2.3 From cd481c12269b4d276f1a52eda0ebd419079bfe3a Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Fri, 27 Mar 2020 16:01:09 +0200 Subject: dm clone: Add overflow check for number of regions Add overflow check for clone->nr_regions variable, which holds the number of regions of the target. The overflow can occur with sufficiently large devices, if BITS_PER_LONG == 32. E.g., if the region size is 8 sectors (4K), the overflow would occur for device sizes > 34359738360 sectors (~16TB). 
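That threshold is easy to check with a standalone computation (illustrative numbers only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t device_sectors = 34359738368ULL;       /* just past the ~16TB mark */
        uint64_t region_size = 8;                       /* 4K regions               */
        uint64_t nr_regions = (device_sectors + region_size - 1) / region_size;

        printf("nr_regions=%llu, truncated to 32 bits=%u\n",
               (unsigned long long)nr_regions, (unsigned int)nr_regions);
        return 0;
}

With these values nr_regions is 2^32, which truncates to 0 in a 32-bit unsigned long.
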
This could result in multiple device sectors wrongly mapping to the same
region number, due to the truncation from 64 bits to 32 bits, which would
lead to data corruption.

Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-clone-target.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 022dddcad647..6ee85fb3388a 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1790,6 +1790,7 @@ error:
 static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	int r;
+	sector_t nr_regions;
 	struct clone *clone;
 	struct dm_arg_set as;

@@ -1831,7 +1832,16 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto out_with_source_dev;

 	clone->region_shift = __ffs(clone->region_size);
-	clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+	nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+
+	/* Check for overflow */
+	if (nr_regions != (unsigned long)nr_regions) {
+		ti->error = "Too many regions. Consider increasing the region size";
+		r = -EOVERFLOW;
+		goto out_with_source_dev;
+	}
+
+	clone->nr_regions = nr_regions;

 	r = validate_nr_regions(clone->nr_regions, &ti->error);
 	if (r)
-- cgit v1.2.3

From 9fc06ff56845cc5ccafec52f545fc2e08d22f849 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis
Date: Fri, 27 Mar 2020 16:01:10 +0200
Subject: dm clone: Add missing casts to prevent overflows and data corruption

Add missing casts when converting from regions to sectors.

In case BITS_PER_LONG == 32, the lack of the appropriate casts can lead to
overflows and miscalculation of the device sector.

As a result, we could end up discarding and/or copying the wrong parts of
the device, thus corrupting the device's data.
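To make the overflow concrete (a minimal, hypothetical sketch, not taken
from the patch; unsigned int stands in for a 32-bit unsigned long, and
sector_t is modelled as a 64-bit integer):

    /* Sketch of the shift overflow that the (sector_t) cast prevents. */
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t; /* stands in for the kernel's 64-bit sector_t */

    int main(void)
    {
            unsigned int region_nr = 700000000U; /* a region number on a large device */
            unsigned int region_shift = 3;       /* region_size = 8 sectors */

            /* Shift done in 32 bits first, then widened: high bits are lost. */
            sector_t bad = region_nr << region_shift;
            /* Widen before shifting, as region_to_sector() does after the fix. */
            sector_t good = (sector_t)region_nr << region_shift;

            /* Prints: bad = 1305032704, good = 5600000000 */
            printf("bad = %llu, good = %llu\n",
                   (unsigned long long)bad, (unsigned long long)good);
            return 0;
    }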
Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-clone-target.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 6ee85fb3388a..ca5020c58f7c 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -282,7 +282,7 @@ static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
 /* Get the address of the region in sectors */
 static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
 {
-	return (region_nr << clone->region_shift);
+	return ((sector_t)region_nr << clone->region_shift);
 }

 /* Get the region number of the bio */
@@ -471,7 +471,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ
 	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
 		remap_to_dest(clone, bio);
 		bio_region_range(clone, bio, &rs, &nr_regions);
-		trim_bio(bio, rs << clone->region_shift,
+		trim_bio(bio, region_to_sector(clone, rs),
 			 nr_regions << clone->region_shift);
 		generic_make_request(bio);
 	} else
@@ -804,11 +804,14 @@ static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr
 	struct dm_io_region from, to;
 	struct clone *clone = hd->clone;

+	if (WARN_ON(!nr_regions))
+		return;
+
 	region_size = clone->region_size;
 	region_start = hd->region_nr;
 	region_end = region_start + nr_regions - 1;

-	total_size = (nr_regions - 1) << clone->region_shift;
+	total_size = region_to_sector(clone, nr_regions - 1);

 	if (region_end == clone->nr_regions - 1) {
 		/*
-- cgit v1.2.3

From 81d5553d1288c2ec0390f02f84d71ca0f0f9f137 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis
Date: Fri, 27 Mar 2020 16:01:11 +0200
Subject: dm clone metadata: Fix return type of dm_clone_nr_of_hydrated_regions()

dm_clone_nr_of_hydrated_regions() returns the number of regions that have
been hydrated so far. In order to do so it employs bitmap_weight().

Until now, the return type of dm_clone_nr_of_hydrated_regions() was
unsigned long.

Because bitmap_weight() returns an int, in case BITS_PER_LONG == 64 and
the return value of bitmap_weight() is 2^31 (the maximum allowed number of
regions for a device), the result is sign extended from 32 bits to 64 bits
and an incorrect value is displayed, in the status output of dm-clone, as
the number of hydrated regions.

Fix this by having dm_clone_nr_of_hydrated_regions() return an unsigned
int.
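A rough illustration of the sign extension (a hypothetical sketch, not
from the patch; the helper below merely stands in for bitmap_weight()
reporting 2^31 set bits):

    /* Sketch of the int -> unsigned long sign extension on a 64-bit build. */
    #include <limits.h>
    #include <stdio.h>

    static int fake_bitmap_weight(void)
    {
            return INT_MIN; /* 2^31 set bits, wrapped to a negative int */
    }

    int main(void)
    {
            /* Old return type: the negative int is sign extended to 64 bits. */
            unsigned long as_ulong = fake_bitmap_weight();
            /* New return type: the value converts to the intended 2147483648. */
            unsigned int as_uint = fake_bitmap_weight();

            printf("unsigned long: %lu\nunsigned int:  %u\n", as_ulong, as_uint);
            return 0;
    }

On a 64-bit machine the first line prints 18446744071562067968 rather than
2147483648, which is the kind of bogus figure that would have shown up in
the status output.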
Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-clone-metadata.c | 2 +-
 drivers/md/dm-clone-metadata.h | 2 +-
 drivers/md/dm-clone-target.c   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index 199e7af00858..17712456fa63 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -656,7 +656,7 @@ bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
 	return (bit >= (start + nr_regions));
 }

-unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
+unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
 {
 	return bitmap_weight(cmd->region_map, cmd->nr_regions);
 }
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
index 14af1ebd853f..d848b8799c07 100644
--- a/drivers/md/dm-clone-metadata.h
+++ b/drivers/md/dm-clone-metadata.h
@@ -156,7 +156,7 @@ bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
 /*
  * Returns the number of hydrated regions.
  */
-unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
+unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);

 /*
  * Returns the first unhydrated region with region_nr >= @start
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index ca5020c58f7c..5ce96ddf1ce1 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1473,7 +1473,7 @@ static void clone_status(struct dm_target *ti, status_type_t type,
 			goto error;
 	}

-	DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
+	DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
 	       DM_CLONE_METADATA_BLOCK_SIZE,
 	       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
 	       (unsigned long long)nr_metadata_blocks,
-- cgit v1.2.3