diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-integrity.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-rq.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 16 | ||||
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 51 | ||||
-rw-r--r-- | drivers/md/dm-zoned-reclaim.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 12 | ||||
-rw-r--r-- | drivers/md/dm.c | 109 |
8 files changed, 143 insertions, 66 deletions
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ae866e469e1b..5da3eb661e50 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2420,7 +2420,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (unlikely(dm_suspended(ic->ti)) && !ic->meta_dev) + if (unlikely(dm_post_suspending(ic->ti)) && !ic->meta_dev) return; spin_lock_irq(&ic->endio_wait.lock); @@ -2481,7 +2481,7 @@ static void integrity_recalc(struct work_struct *w) next_chunk: - if (unlikely(dm_suspended(ic->ti))) + if (unlikely(dm_post_suspending(ic->ti))) goto unlock_ret; range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index ac83f5002ce5..489935d5f22d 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1471,7 +1471,7 @@ static void retrieve_deps(struct dm_table *table, /* * Check we have enough space. */ - needed = sizeof(*deps) + (sizeof(*deps->dev) * count); + needed = struct_size(deps, dev, count); if (len < needed) { param->flags |= DM_BUFFER_FULL_FLAG; return; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 5aec1cd09348..7ce387a1cc6a 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -146,10 +146,6 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md) { - /* nudge anyone waiting on suspend queue */ - if (unlikely(wq_has_sleeper(&md->wait))) - wake_up(&md->wait); - /* * dm_put() must be at the end of this function. See the comment above */ diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 62421554b838..8aa306ebc2ab 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -282,6 +282,8 @@ static int persistent_memory_claim(struct dm_writecache *wc) while (daa-- && i < p) { pages[i++] = pfn_t_to_page(pfn); pfn.val++; + if (!(i & 15)) + cond_resched(); } } while (i < p); wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); @@ -849,10 +851,14 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ if (likely(!e->write_in_progress)) { if (!discarded_something) { - writecache_wait_for_ios(wc, READ); - writecache_wait_for_ios(wc, WRITE); + if (!WC_MODE_PMEM(wc)) { + writecache_wait_for_ios(wc, READ); + writecache_wait_for_ios(wc, WRITE); + } discarded_something = true; } + if (!writecache_entry_is_committed(wc, e)) + wc->uncommitted_blocks--; writecache_free_entry(wc, e); } @@ -2260,6 +2266,12 @@ invalid_optional: } if (WC_MODE_PMEM(wc)) { + if (!dax_synchronous(wc->ssd_dev->dax_dev)) { + r = -EOPNOTSUPP; + ti->error = "Asynchronous persistent memory not supported as pmem cache"; + goto bad; + } + r = persistent_memory_claim(wc); if (r) { ti->error = "Unable to map persistent memory for cache"; diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 130b5a6d9f12..b298fefb022e 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1078,7 +1078,8 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1) >> zmd->zone_nr_blocks_shift; if (!nr_meta_zones || - nr_meta_zones >= zmd->nr_rnd_zones) { + (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) || + (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) { dmz_dev_err(dev, "Invalid number of metadata blocks"); return -ENXIO; } @@ -1949,7 +1950,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, unsigned int idx, bool idle) { struct dm_zone *dzone = NULL; - struct dm_zone *zone, *last = NULL; + struct dm_zone *zone, *maxw_z = NULL; struct list_head *zone_list; /* If we have cache zones select from the cache zone list */ @@ -1961,18 +1962,37 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, } else zone_list = &zmd->dev[idx].map_rnd_list; + /* + * Find the buffer zone with the heaviest weight or the first (oldest) + * data zone that can be reclaimed. + */ list_for_each_entry(zone, zone_list, link) { if (dmz_is_buf(zone)) { dzone = zone->bzone; - if (dzone->dev->dev_idx != idx) - continue; - if (!last) { - last = dzone; + if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx) continue; - } - if (last->weight < dzone->weight) + if (!maxw_z || maxw_z->weight < dzone->weight) + maxw_z = dzone; + } else { + dzone = zone; + if (dmz_lock_zone_reclaim(dzone)) + return dzone; + } + } + + if (maxw_z && dmz_lock_zone_reclaim(maxw_z)) + return maxw_z; + + /* + * If we come here, none of the zones inspected could be locked for + * reclaim. Try again, being more aggressive, that is, find the + * first zone that can be reclaimed regardless of its weitght. + */ + list_for_each_entry(zone, zone_list, link) { + if (dmz_is_buf(zone)) { + dzone = zone->bzone; + if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx) continue; - dzone = last; } else dzone = zone; if (dmz_lock_zone_reclaim(dzone)) @@ -2006,7 +2026,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, unsigned int dev_idx, bool idle) { - struct dm_zone *zone; + struct dm_zone *zone = NULL; /* * Search for a zone candidate to reclaim: 2 cases are possible. @@ -2019,7 +2039,7 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, dmz_lock_map(zmd); if (list_empty(&zmd->reserved_seq_zones_list)) zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx); - else + if (!zone) zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle); dmz_unlock_map(zmd); @@ -2197,8 +2217,15 @@ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx, { struct list_head *list; struct dm_zone *zone; - int i = 0; + int i; + + /* Schedule reclaim to ensure free zones are available */ + if (!(flags & DMZ_ALLOC_RECLAIM)) { + for (i = 0; i < zmd->nr_devs; i++) + dmz_schedule_reclaim(zmd->dev[i].reclaim); + } + i = 0; again: if (flags & DMZ_ALLOC_CACHE) list = &zmd->unmap_cache_list; diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 2261b4dd60b7..9c0ecc9568a4 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -377,6 +377,7 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) dmz_metadata_label(zmd), zrc->dev_idx); return -EBUSY; } + rzone = dzone; start = jiffies; if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) { @@ -391,8 +392,6 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) */ ret = dmz_reclaim_rnd_data(zrc, dzone); } - rzone = dzone; - } else { struct dm_zone *bzone = dzone->bzone; sector_t chunk_block = 0; @@ -415,7 +414,6 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) * be later reclaimed. */ ret = dmz_reclaim_seq_data(zrc, dzone); - rzone = dzone; } } out: @@ -458,6 +456,8 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) nr_zones = dmz_nr_rnd_zones(zmd, zrc->dev_idx); nr_unmap = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); } + if (nr_unmap <= 1) + return 0; return nr_unmap * 100 / nr_zones; } @@ -503,7 +503,7 @@ static void dmz_reclaim_work(struct work_struct *work) { struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work); struct dmz_metadata *zmd = zrc->metadata; - unsigned int p_unmap, nr_unmap_rnd = 0, nr_rnd = 0; + unsigned int p_unmap; int ret; if (dmz_dev_is_dying(zmd)) @@ -529,9 +529,6 @@ static void dmz_reclaim_work(struct work_struct *work) zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); } - nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); - nr_rnd = dmz_nr_rnd_zones(zmd, zrc->dev_idx); - DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", dmz_metadata_label(zmd), zrc->dev_idx, zrc->kc_throttle.throttle, diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 05a3cfefe937..697f9de37355 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -400,15 +400,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); struct dmz_metadata *zmd = dmz->metadata; struct dm_zone *zone; - int i, ret; - - /* - * Write may trigger a zone allocation. So make sure the - * allocation can succeed. - */ - if (bio_op(bio) == REQ_OP_WRITE) - for (i = 0; i < dmz->nr_ddevs; i++) - dmz_schedule_reclaim(dmz->dev[i].reclaim); + int ret; dmz_lock_metadata(zmd); @@ -890,7 +882,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) } /* Set target (no write same support) */ - ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata) << 9; + ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata); ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_zeroes_bios = 1; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e2148fcb88bb..87cf45f619fd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/blkpg.h> #include <linux/bio.h> @@ -142,6 +143,7 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_DEFERRED_REMOVE 6 #define DMF_SUSPENDED_INTERNALLY 7 +#define DMF_POST_SUSPENDING 8 #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; @@ -654,28 +656,6 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -static bool md_in_flight_bios(struct mapped_device *md) -{ - int cpu; - struct hd_struct *part = &dm_disk(md)->part0; - long sum = 0; - - for_each_possible_cpu(cpu) { - sum += part_stat_local_read_cpu(part, in_flight[0], cpu); - sum += part_stat_local_read_cpu(part, in_flight[1], cpu); - } - - return sum != 0; -} - -static bool md_in_flight(struct mapped_device *md) -{ - if (queue_is_mq(md->queue)) - return blk_mq_queue_inflight(md->queue); - else - return md_in_flight_bios(md); -} - u64 dm_start_time_ns_from_clone(struct bio *bio) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); @@ -1009,6 +989,7 @@ static void clone_endio(struct bio *bio) struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; + struct bio *orig_bio = io->orig_bio; if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) { if (bio_op(bio) == REQ_OP_DISCARD && @@ -1022,6 +1003,18 @@ static void clone_endio(struct bio *bio) disable_write_zeroes(md); } + /* + * For zone-append bios get offset in zone of the written + * sector and add that to the original bio sector pos. + */ + if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { + sector_t written_sector = bio->bi_iter.bi_sector; + struct request_queue *q = orig_bio->bi_disk->queue; + u64 mask = (u64)blk_queue_zone_sectors(q) - 1; + + orig_bio->bi_iter.bi_sector += written_sector & mask; + } + if (endio) { int r = endio(tio->ti, bio, &error); switch (r) { @@ -2378,6 +2371,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); } /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ @@ -2418,15 +2412,29 @@ void dm_put(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_put); -static int dm_wait_for_completion(struct mapped_device *md, long task_state) +static bool md_in_flight_bios(struct mapped_device *md) +{ + int cpu; + struct hd_struct *part = &dm_disk(md)->part0; + long sum = 0; + + for_each_possible_cpu(cpu) { + sum += part_stat_local_read_cpu(part, in_flight[0], cpu); + sum += part_stat_local_read_cpu(part, in_flight[1], cpu); + } + + return sum != 0; +} + +static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state) { int r = 0; DEFINE_WAIT(wait); - while (1) { + while (true) { prepare_to_wait(&md->wait, &wait, task_state); - if (!md_in_flight(md)) + if (!md_in_flight_bios(md)) break; if (signal_pending_state(task_state, current)) { @@ -2441,6 +2449,28 @@ static int dm_wait_for_completion(struct mapped_device *md, long task_state) return r; } +static int dm_wait_for_completion(struct mapped_device *md, long task_state) +{ + int r = 0; + + if (!queue_is_mq(md->queue)) + return dm_wait_for_bios_completion(md, task_state); + + while (true) { + if (!blk_mq_queue_inflight(md->queue)) + break; + + if (signal_pending_state(task_state, current)) { + r = -EINTR; + break; + } + + msleep(5); + } + + return r; +} + /* * Process the deferred bios */ @@ -2700,7 +2730,9 @@ retry: if (r) goto out_unlock; + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); + clear_bit(DMF_POST_SUSPENDING, &md->flags); out_unlock: mutex_unlock(&md->suspend_lock); @@ -2797,7 +2829,9 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, DMF_SUSPENDED_INTERNALLY); + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); + clear_bit(DMF_POST_SUSPENDING, &md->flags); } static void __dm_internal_resume(struct mapped_device *md) @@ -2874,17 +2908,25 @@ EXPORT_SYMBOL_GPL(dm_internal_resume_fast); int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned cookie) { + int r; + unsigned noio_flag; char udev_cookie[DM_COOKIE_LENGTH]; char *envp[] = { udev_cookie, NULL }; + noio_flag = memalloc_noio_save(); + if (!cookie) - return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); else { snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", DM_COOKIE_ENV_VAR_NAME, cookie); - return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, - action, envp); + r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, + action, envp); } + + memalloc_noio_restore(noio_flag); + + return r; } uint32_t dm_next_uevent_seq(struct mapped_device *md) @@ -2950,6 +2992,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +static int dm_post_suspending_md(struct mapped_device *md) +{ + return test_bit(DMF_POST_SUSPENDING, &md->flags); +} + int dm_suspended_internally_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); @@ -2966,6 +3013,12 @@ int dm_suspended(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_suspended); +int dm_post_suspending(struct dm_target *ti) +{ + return dm_post_suspending_md(dm_table_get_md(ti->table)); +} +EXPORT_SYMBOL_GPL(dm_post_suspending); + int dm_noflush_suspending(struct dm_target *ti) { return __noflush_suspending(dm_table_get_md(ti->table)); |