summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-01-30 05:08:49 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2020-01-30 05:08:49 +0300
commite9f8ca0ae7b7bc9a032b429929431c626a69dd5e (patch)
treec29205e2f381879ae148b5ca9c23b4afa55ece05
parent05ef8b97ddf9aed40df977477daeab01760d7f9a (diff)
parent47ace7e012b9f7ad71d43ac9063d335ea3d6820b (diff)
downloadlinux-e9f8ca0ae7b7bc9a032b429929431c626a69dd5e.tar.xz
Merge tag 'for-5.6/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - Fix DM core's potential for q->make_request_fn NULL pointer in the unlikely case that a DM device is created without a DM table and then accessed due to upper-layer userspace code or user error. - Fix DM thin-provisioning's metadata_pre_commit_callback to not use memory after it is freed. Also refactor code to disallow changing the thin-pool's data device once in use -- doing so guarantees the same lifetime of pool's data device relative to the pool metadata. - Fix DM space maps used by DM thinp and DM cache to avoid reuse of an already used block. This race was identified with extremely heavy snapshot use in the context of DM thin provisioning. - Fix DM raid's table status relative to an active rebuild. - Fix DM crypt to use GFP_NOIO rather than GFP_NOFS in the call to skcipher_request_alloc(). Also fix benbi IV constructor crash if used in authenticated mode. - Add DM crypt support for Elephant diffuser to allow for Bitlocker compatibility. - Fix DM verity target to not prefetch hash blocks for data that has already been verified. - Fix DM writecache's incorrect flush sequence during commit when in SSD mode. - Improve DM writecache's sequential write performance on SSDs. - Add DM zoned target support for zone sizes smaller than 128MiB. - Add DM multipath 'queue_if_no_path_timeout_secs' module param to allow timeout if path isn't reinstated. This allows users a kernel safety-net against IO hanging indefinitely, due to no active paths, that has historically only been provided by multipathd userspace. - Various DM code cleanups to use true/false rather than 1/0, a variable rename in dm-dust, and a fix for a math error in comment for DM thin metadata's on-disk format. 
* tag 'for-5.6/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (21 commits) dm: fix potential for q->make_request_fn NULL pointer dm writecache: improve performance of large linear writes on SSDs dm mpath: Add timeout mechanism for queue_if_no_path dm thin: change data device's flush_bio to be member of struct pool dm thin: don't allow changing data device during thin-pool reload dm thin: fix use-after-free in metadata_pre_commit_callback dm thin metadata: use pool locking at end of dm_pool_metadata_close dm writecache: fix incorrect flush sequence when doing SSD mode commit dm crypt: fix benbi IV constructor crash if used in authenticated mode dm crypt: Implement Elephant diffuser for Bitlocker compatibility dm space map common: fix to ensure new block isn't already in use dm verity: don't prefetch hash blocks for already-verified data dm crypt: fix GFP flags passed to skcipher_request_alloc() dm thin metadata: Fix trivial math error in on-disk format documentation dm thin metadata: use true/false for bool variable dm snapshot: use true/false for bool variable dm bio prison v2: use true/false for bool variable dm mpath: use true/false for bool variable dm zoned: support zone sizes smaller than 128MiB dm raid: table line rebuild status fixes ...
-rw-r--r--Documentation/admin-guide/device-mapper/dm-raid.rst2
-rw-r--r--drivers/md/dm-bio-prison-v2.c2
-rw-r--r--drivers/md/dm-crypt.c335
-rw-r--r--drivers/md/dm-dust.c6
-rw-r--r--drivers/md/dm-mpath.c68
-rw-r--r--drivers/md/dm-raid.c43
-rw-r--r--drivers/md/dm-snap.c6
-rw-r--r--drivers/md/dm-thin-metadata.c22
-rw-r--r--drivers/md/dm-thin.c36
-rw-r--r--drivers/md/dm-verity-target.c18
-rw-r--r--drivers/md/dm-writecache.c71
-rw-r--r--drivers/md/dm-zoned-metadata.c23
-rw-r--r--drivers/md/dm.c9
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c27
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.h2
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c6
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c5
17 files changed, 580 insertions, 101 deletions
diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst
index f6344675e395..695a2ea1d1ae 100644
--- a/Documentation/admin-guide/device-mapper/dm-raid.rst
+++ b/Documentation/admin-guide/device-mapper/dm-raid.rst
@@ -419,3 +419,5 @@ Version History
rebuild errors.
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
pages allocated; also fix those not occuring after previous reductions
+ 1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode)
+ on the status line.
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index 8ee019eda32d..9dec3b61cf70 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -324,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison,
bio_list_init(&cell->bios);
if (cell->shared_count) {
- cell->exclusive_lock = 0;
+ cell->exclusive_lock = false;
return false;
}
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index eb9782fc93fe..c6a529873d0f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,8 +1,8 @@
/*
* Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved.
- * Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com>
+ * Copyright (C) 2006-2020 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013-2020 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
*/
@@ -115,6 +115,11 @@ struct iv_tcw_private {
u8 *whitening;
};
+#define ELEPHANT_MAX_KEY_SIZE 32
+struct iv_elephant_private {
+ struct crypto_skcipher *tfm;
+};
+
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
@@ -125,6 +130,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
enum cipher_flags {
CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */
CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
+ CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */
};
/*
@@ -152,6 +158,7 @@ struct crypt_config {
struct iv_benbi_private benbi;
struct iv_lmk_private lmk;
struct iv_tcw_private tcw;
+ struct iv_elephant_private elephant;
} iv_gen_private;
u64 iv_offset;
unsigned int iv_size;
@@ -285,6 +292,11 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
* eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode)
* The IV is encrypted little-endian byte-offset (with the same key
* and cipher as the volume).
+ *
+ * elephant: The extended version of eboiv with additional Elephant diffuser
+ * used with Bitlocker CBC mode.
+ * This mode was used in older Windows systems
+ * http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf
*/
static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
@@ -331,8 +343,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
- unsigned bs = crypto_skcipher_blocksize(any_tfm(cc));
- int log = ilog2(bs);
+ unsigned bs;
+ int log;
+
+ if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
+ bs = crypto_aead_blocksize(any_tfm_aead(cc));
+ else
+ bs = crypto_skcipher_blocksize(any_tfm(cc));
+ log = ilog2(bs);
/* we need to calculate how far we must shift the sector count
* to get the cipher block count, we use this shift in _gen */
@@ -717,7 +735,7 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
struct crypto_wait wait;
int err;
- req = skcipher_request_alloc(any_tfm(cc), GFP_KERNEL | GFP_NOFS);
+ req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO);
if (!req)
return -ENOMEM;
@@ -734,6 +752,290 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
return err;
}
+static void crypt_iv_elephant_dtr(struct crypt_config *cc)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+
+ crypto_free_skcipher(elephant->tfm);
+ elephant->tfm = NULL;
+}
+
+static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ int r;
+
+ elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
+ if (IS_ERR(elephant->tfm)) {
+ r = PTR_ERR(elephant->tfm);
+ elephant->tfm = NULL;
+ return r;
+ }
+
+ r = crypt_iv_eboiv_ctr(cc, ti, NULL);
+ if (r)
+ crypt_iv_elephant_dtr(cc);
+ return r;
+}
+
+static void diffuser_disk_to_cpu(u32 *d, size_t n)
+{
+#ifndef __LITTLE_ENDIAN
+ int i;
+
+ for (i = 0; i < n; i++)
+ d[i] = le32_to_cpu((__le32)d[i]);
+#endif
+}
+
+static void diffuser_cpu_to_disk(__le32 *d, size_t n)
+{
+#ifndef __LITTLE_ENDIAN
+ int i;
+
+ for (i = 0; i < n; i++)
+ d[i] = cpu_to_le32((u32)d[i]);
+#endif
+}
+
+static void diffuser_a_decrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 5; i++) {
+ i1 = 0;
+ i2 = n - 2;
+ i3 = n - 5;
+
+ while (i1 < (n - 1)) {
+ d[i1] += d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
+ i1++; i2++; i3++;
+
+ if (i3 >= n)
+ i3 -= n;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ if (i2 >= n)
+ i2 -= n;
+
+ d[i1] += d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
+ i1++; i2++; i3++;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+ }
+ }
+}
+
+static void diffuser_a_encrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 5; i++) {
+ i1 = n - 1;
+ i2 = n - 2 - 1;
+ i3 = n - 5 - 1;
+
+ while (i1 > 0) {
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ d[i1] -= d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
+ i1--; i2--; i3--;
+
+ if (i2 < 0)
+ i2 += n;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ if (i3 < 0)
+ i3 += n;
+
+ d[i1] -= d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
+ i1--; i2--; i3--;
+ }
+ }
+}
+
+static void diffuser_b_decrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 3; i++) {
+ i1 = 0;
+ i2 = 2;
+ i3 = 5;
+
+ while (i1 < (n - 1)) {
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ d[i1] += d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
+ i1++; i2++; i3++;
+
+ if (i2 >= n)
+ i2 -= n;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ if (i3 >= n)
+ i3 -= n;
+
+ d[i1] += d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
+ i1++; i2++; i3++;
+ }
+ }
+}
+
+static void diffuser_b_encrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 3; i++) {
+ i1 = n - 1;
+ i2 = 2 - 1;
+ i3 = 5 - 1;
+
+ while (i1 > 0) {
+ d[i1] -= d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
+ i1--; i2--; i3--;
+
+ if (i3 < 0)
+ i3 += n;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ if (i2 < 0)
+ i2 += n;
+
+ d[i1] -= d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
+ i1--; i2--; i3--;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+ }
+ }
+}
+
+static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ u8 *es, *ks, *data, *data2, *data_offset;
+ struct skcipher_request *req;
+ struct scatterlist *sg, *sg2, src, dst;
+ struct crypto_wait wait;
+ int i, r;
+
+ req = skcipher_request_alloc(elephant->tfm, GFP_NOIO);
+ es = kzalloc(16, GFP_NOIO); /* Key for AES */
+ ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */
+
+ if (!req || !es || !ks) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
+
+ /* E(Ks, e(s)) */
+ sg_init_one(&src, es, 16);
+ sg_init_one(&dst, ks, 16);
+ skcipher_request_set_crypt(req, &src, &dst, 16, NULL);
+ skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
+ r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+ if (r)
+ goto out;
+
+ /* E(Ks, e'(s)) */
+ es[15] = 0x80;
+ sg_init_one(&dst, &ks[16], 16);
+ r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+ if (r)
+ goto out;
+
+ sg = crypt_get_sg_data(cc, dmreq->sg_out);
+ data = kmap_atomic(sg_page(sg));
+ data_offset = data + sg->offset;
+
+ /* Cannot modify original bio, copy to sg_out and apply Elephant to it */
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ sg2 = crypt_get_sg_data(cc, dmreq->sg_in);
+ data2 = kmap_atomic(sg_page(sg2));
+ memcpy(data_offset, data2 + sg2->offset, cc->sector_size);
+ kunmap_atomic(data2);
+ }
+
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
+ diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_b_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_a_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
+ }
+
+ for (i = 0; i < (cc->sector_size / 32); i++)
+ crypto_xor(data_offset + i * 32, ks, 32);
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_a_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_b_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
+ }
+
+ kunmap_atomic(data);
+out:
+ kzfree(ks);
+ kzfree(es);
+ skcipher_request_free(req);
+ return r;
+}
+
+static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ int r;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ r = crypt_iv_elephant(cc, dmreq);
+ if (r)
+ return r;
+ }
+
+ return crypt_iv_eboiv_gen(cc, iv, dmreq);
+}
+
+static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
+ return crypt_iv_elephant(cc, dmreq);
+
+ return 0;
+}
+
+static int crypt_iv_elephant_init(struct crypt_config *cc)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ int key_offset = cc->key_size - cc->key_extra_size;
+
+ return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size);
+}
+
+static int crypt_iv_elephant_wipe(struct crypt_config *cc)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ u8 key[ELEPHANT_MAX_KEY_SIZE];
+
+ memset(key, 0, cc->key_extra_size);
+ return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size);
+}
+
static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -787,6 +1089,15 @@ static struct crypt_iv_operations crypt_iv_eboiv_ops = {
.generator = crypt_iv_eboiv_gen
};
+static struct crypt_iv_operations crypt_iv_elephant_ops = {
+ .ctr = crypt_iv_elephant_ctr,
+ .dtr = crypt_iv_elephant_dtr,
+ .init = crypt_iv_elephant_init,
+ .wipe = crypt_iv_elephant_wipe,
+ .generator = crypt_iv_elephant_gen,
+ .post = crypt_iv_elephant_post
+};
+
/*
* Integrity extensions
*/
@@ -1103,6 +1414,9 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
if (r < 0)
return r;
+ /* Data can be already preprocessed in generator */
+ if (test_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags))
+ sg_in = sg_out;
/* Store generated IV in integrity metadata */
if (cc->integrity_iv_size)
memcpy(tag_iv, org_iv, cc->integrity_iv_size);
@@ -2191,7 +2505,14 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
cc->iv_gen_ops = &crypt_iv_null_ops;
else if (strcmp(ivmode, "eboiv") == 0)
cc->iv_gen_ops = &crypt_iv_eboiv_ops;
- else if (strcmp(ivmode, "lmk") == 0) {
+ else if (strcmp(ivmode, "elephant") == 0) {
+ cc->iv_gen_ops = &crypt_iv_elephant_ops;
+ cc->key_parts = 2;
+ cc->key_extra_size = cc->key_size / 2;
+ if (cc->key_extra_size > ELEPHANT_MAX_KEY_SIZE)
+ return -EINVAL;
+ set_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags);
+ } else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops;
/*
* Version 2 and 3 is recognised according
@@ -2959,7 +3280,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 19, 0},
+ .version = {1, 20, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
index eb37584427a4..ff03b90072c5 100644
--- a/drivers/md/dm-dust.c
+++ b/drivers/md/dm-dust.c
@@ -207,16 +207,16 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
- int ret = DM_MAPIO_REMAPPED;
+ int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
- ret = __dust_map_write(dd, thisblock);
+ r = __dust_map_write(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
- return ret;
+ return r;
}
static int dust_map(struct dm_target *ti, struct bio *bio)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index e0c32793c248..2bc18c9c3abc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
+#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
@@ -29,6 +30,9 @@
#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
+#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
+
+static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
/* Path properties */
struct pgpath {
@@ -91,6 +95,8 @@ struct multipath {
struct work_struct process_queued_bios;
struct bio_list queued_bios;
+
+ struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
};
/*
@@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
+static void queue_if_no_path_timeout_work(struct timer_list *t);
/*-----------------------------------------------
* Multipath state flags.
@@ -195,6 +202,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->ti = ti;
ti->private = m;
+
+ timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
}
return m;
@@ -718,6 +727,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
}
/*
+ * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
+ * process any queued I/O.
+ */
+static void queue_if_no_path_timeout_work(struct timer_list *t)
+{
+ struct multipath *m = from_timer(m, t, nopath_timer);
+ struct mapped_device *md = dm_table_get_md(m->ti->table);
+
+ DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md));
+ queue_if_no_path(m, false, false);
+}
+
+/*
+ * Enable the queue_if_no_path timeout if necessary.
+ * Called with m->lock held.
+ */
+static void enable_nopath_timeout(struct multipath *m)
+{
+ unsigned long queue_if_no_path_timeout =
+ READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
+
+ lockdep_assert_held(&m->lock);
+
+ if (queue_if_no_path_timeout > 0 &&
+ atomic_read(&m->nr_valid_paths) == 0 &&
+ test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ mod_timer(&m->nopath_timer,
+ jiffies + queue_if_no_path_timeout);
+ }
+}
+
+static void disable_nopath_timeout(struct multipath *m)
+{
+ del_timer_sync(&m->nopath_timer);
+}
+
+/*
* An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass.
*/
@@ -1090,6 +1136,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
+ unsigned long flags;
as.argc = argc;
as.argv = argv;
@@ -1154,6 +1201,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
+ spin_lock_irqsave(&m->lock, flags);
+ enable_nopath_timeout(m);
+ spin_unlock_irqrestore(&m->lock, flags);
+
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
@@ -1208,6 +1259,7 @@ static void multipath_dtr(struct dm_target *ti)
{
struct multipath *m = ti->private;
+ disable_nopath_timeout(m);
flush_multipath_work(m);
free_multipath(m);
}
@@ -1241,6 +1293,8 @@ static int fail_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
+ enable_nopath_timeout(m);
+
out:
spin_unlock_irqrestore(&m->lock, flags);
@@ -1291,6 +1345,9 @@ out:
process_queued_io_list(m);
}
+ if (pgpath->is_active)
+ disable_nopath_timeout(m);
+
return r;
}
@@ -1444,7 +1501,7 @@ static void pg_init_done(void *data, int errors)
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
- delay_retry = 1;
+ delay_retry = true;
/* fall through */
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
@@ -1789,6 +1846,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
struct dm_dev *dev;
struct multipath *m = ti->private;
action_fn action;
+ unsigned long flags;
mutex_lock(&m->work_mutex);
@@ -1800,9 +1858,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, true, false);
+ spin_lock_irqsave(&m->lock, flags);
+ enable_nopath_timeout(m);
+ spin_unlock_irqrestore(&m->lock, flags);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, false, false);
+ disable_nopath_timeout(m);
goto out;
}
}
@@ -2065,6 +2127,10 @@ static void __exit dm_multipath_exit(void)
module_init(dm_multipath_init);
module_exit(dm_multipath_exit);
+module_param_named(queue_if_no_path_timeout_secs,
+ queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
+
MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c412eaa975fc..9a18bef0a5ff 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -129,7 +129,9 @@ struct raid_dev {
CTR_FLAG_RAID10_COPIES | \
CTR_FLAG_RAID10_FORMAT | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
/* Valid options definitions per raid level... */
@@ -3001,7 +3003,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{ 1, 254, "Cannot understand number of raid devices parameters" }
};
- /* Must have <raid_type> */
arg = dm_shift_arg(&as);
if (!arg) {
ti->error = "No arguments";
@@ -3508,8 +3509,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
unsigned long recovery;
unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
unsigned int sz = 0;
- unsigned int rebuild_disks;
- unsigned int write_mostly_params = 0;
+ unsigned int rebuild_writemostly_count = 0;
sector_t progress, resync_max_sectors, resync_mismatches;
enum sync_state state;
struct raid_type *rt;
@@ -3593,18 +3593,20 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE:
/* Report the table line string you would use to construct this raid set */
- /* Calculate raid parameter count */
- for (i = 0; i < rs->raid_disks; i++)
- if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
- write_mostly_params += 2;
- rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks));
- raid_param_cnt += rebuild_disks * 2 +
- write_mostly_params +
+ /*
+ * Count any rebuild or writemostly argument pairs and subtract the
+ * hweight count being added below of any rebuild and writemostly ctr flags.
+ */
+ for (i = 0; i < rs->raid_disks; i++) {
+ rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) +
+ (test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0);
+ }
+ rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) +
+ (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0);
+ /* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. */
+ raid_param_cnt += rebuild_writemostly_count +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
- hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
- (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
- (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
-
+ hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
/* Emit table line */
/* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
@@ -3612,11 +3614,10 @@ static void raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
- if (rebuild_disks)
+ if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
- if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
- DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
- rs->dev[i].rdev.raid_disk);
+ if (test_bit(i, (void *) rs->rebuild_disks))
+ DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i);
if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
mddev->bitmap_info.daemon_sleep);
@@ -3626,7 +3627,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
mddev->sync_speed_max);
- if (write_mostly_params)
+ if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
@@ -4029,7 +4030,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 15, 0},
+ .version = {1, 15, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 4fb1a40e68a0..6b11a266299f 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1061,7 +1061,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
DMERR("Read error in exception store: "
"shutting down merge");
down_write(&s->lock);
- s->merge_failed = 1;
+ s->merge_failed = true;
up_write(&s->lock);
}
goto shut;
@@ -1149,7 +1149,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
shut:
down_write(&s->lock);
- s->merge_failed = 1;
+ s->merge_failed = true;
b = __release_queued_bios_after_merge(s);
up_write(&s->lock);
error_bios(b);
@@ -1314,7 +1314,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
- s->merge_failed = 0;
+ s->merge_failed = false;
s->first_merging_chunk = 0;
s->num_merging_chunks = 0;
bio_list_init(&s->bios_queued_during_merge);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index b88d6d701f5b..fc9947d6210c 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -28,7 +28,7 @@
*
* - A hierarchical btree, with 2 levels which effectively maps (thin
* dev id, virtual block) -> block_time. Block time is a 64-bit
- * field holding the time in the low 24 bits, and block in the top 48
+ * field holding the time in the low 24 bits, and block in the top 40
* bits.
*
* BTrees consist solely of btree_nodes, that fill a block. Some are
@@ -387,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
* Variant that is used for in-core only changes or code that
* shouldn't put the pool in service on its own (e.g. commit).
*/
-static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
+static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
__acquires(pmd->root_lock)
{
down_write(&pmd->root_lock);
}
-#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
- __pmd_write_lock(pmd);
+ pmd_write_lock_in_core(pmd);
if (unlikely(!pmd->in_service))
pmd->in_service = true;
}
@@ -811,7 +810,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
return r;
if (td->open_count)
- td->changed = 0;
+ td->changed = false;
else {
list_del(&td->list);
kfree(td);
@@ -831,6 +830,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
* We need to know if the thin_disk_superblock exceeds a 512-byte sector.
*/
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
+ BUG_ON(!rwsem_is_locked(&pmd->root_lock));
if (unlikely(!pmd->in_service))
return 0;
@@ -953,6 +953,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return -EBUSY;
}
+ pmd_write_lock_in_core(pmd);
if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
r = __commit_transaction(pmd);
if (r < 0)
@@ -961,6 +962,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
}
if (!pmd->fail_io)
__destroy_persistent_data_objects(pmd);
+ pmd_write_unlock(pmd);
kfree(pmd);
return 0;
@@ -1106,7 +1108,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd,
if (r)
return r;
- td->changed = 1;
+ td->changed = true;
td->snapshotted_time = time;
snap->mapped_blocks = td->mapped_blocks;
@@ -1618,7 +1620,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
if (r)
return r;
- td->changed = 1;
+ td->changed = true;
if (inserted)
td->mapped_blocks++;
@@ -1649,7 +1651,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
return r;
td->mapped_blocks--;
- td->changed = 1;
+ td->changed = true;
return 0;
}
@@ -1703,7 +1705,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
}
td->mapped_blocks -= total_count;
- td->changed = 1;
+ td->changed = true;
/*
* Reinsert the mapping tree.
@@ -1841,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
* Care is taken to not have commit be what
* triggers putting the thin-pool in-service.
*/
- __pmd_write_lock(pmd);
+ pmd_write_lock_in_core(pmd);
if (pmd->fail_io)
goto out;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 57626c27a54b..fa8d5464c1fb 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -231,6 +231,7 @@ struct pool {
struct dm_target *ti; /* Only set if a pool target is bound */
struct mapped_device *pool_md;
+ struct block_device *data_dev;
struct block_device *md_dev;
struct dm_pool_metadata *pmd;
@@ -281,6 +282,8 @@ struct pool {
struct dm_bio_prison_cell **cell_sort_array;
mempool_t mapping_pool;
+
+ struct bio flush_bio;
};
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
@@ -328,7 +331,6 @@ struct pool_c {
dm_block_t low_water_blocks;
struct pool_features requested_pf; /* Features requested during table load */
struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
- struct bio flush_bio;
};
/*
@@ -2924,6 +2926,7 @@ static void __pool_destroy(struct pool *pool)
if (pool->next_mapping)
mempool_free(pool->next_mapping, &pool->mapping_pool);
mempool_exit(&pool->mapping_pool);
+ bio_uninit(&pool->flush_bio);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
@@ -2933,6 +2936,7 @@ static struct kmem_cache *_new_mapping_cache;
static struct pool *pool_create(struct mapped_device *pool_md,
struct block_device *metadata_dev,
+ struct block_device *data_dev,
unsigned long block_size,
int read_only, char **error)
{
@@ -3003,6 +3007,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = false;
pool->suspended = true;
pool->out_of_data_space = false;
+ bio_init(&pool->flush_bio, NULL, 0);
pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) {
@@ -3040,6 +3045,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
+ pool->data_dev = data_dev;
__pool_table_insert(pool);
return pool;
@@ -3081,6 +3087,7 @@ static void __pool_dec(struct pool *pool)
static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev,
+ struct block_device *data_dev,
unsigned long block_size, int read_only,
char **error, int *created)
{
@@ -3091,19 +3098,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
*error = "metadata device already in use by a pool";
return ERR_PTR(-EBUSY);
}
+ if (pool->data_dev != data_dev) {
+ *error = "data device already in use by a pool";
+ return ERR_PTR(-EBUSY);
+ }
__pool_inc(pool);
} else {
pool = __pool_table_lookup(pool_md);
if (pool) {
- if (pool->md_dev != metadata_dev) {
+ if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
*error = "different pool cannot replace a pool";
return ERR_PTR(-EINVAL);
}
__pool_inc(pool);
} else {
- pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
+ pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
*created = 1;
}
}
@@ -3124,7 +3135,6 @@ static void pool_dtr(struct dm_target *ti)
__pool_dec(pt->pool);
dm_put_device(ti, pt->metadata_dev);
dm_put_device(ti, pt->data_dev);
- bio_uninit(&pt->flush_bio);
kfree(pt);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3203,11 +3213,11 @@ static void metadata_low_callback(void *context)
*/
static int metadata_pre_commit_callback(void *context)
{
- struct pool_c *pt = context;
- struct bio *flush_bio = &pt->flush_bio;
+ struct pool *pool = context;
+ struct bio *flush_bio = &pool->flush_bio;
bio_reset(flush_bio);
- bio_set_dev(flush_bio, pt->data_dev->bdev);
+ bio_set_dev(flush_bio, pool->data_dev);
flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
return submit_bio_wait(flush_bio);
@@ -3356,7 +3366,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out;
}
- pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
+ pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
if (IS_ERR(pool)) {
r = PTR_ERR(pool);
@@ -3381,7 +3391,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf;
- bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1;
/*
@@ -3408,9 +3417,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto out_flags_changed;
- dm_pool_register_pre_commit_callback(pt->pool->pmd,
- metadata_pre_commit_callback,
- pt);
+ dm_pool_register_pre_commit_callback(pool->pmd,
+ metadata_pre_commit_callback, pool);
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
@@ -4099,7 +4107,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 21, 0},
+ .version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4476,7 +4484,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 21, 0},
+ .version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 4fb33e7562c5..0d61e9c67986 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -611,8 +611,22 @@ no_prefetch_cluster:
static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
{
+ sector_t block = io->block;
+ unsigned int n_blocks = io->n_blocks;
struct dm_verity_prefetch_work *pw;
+ if (v->validated_blocks) {
+ while (n_blocks && test_bit(block, v->validated_blocks)) {
+ block++;
+ n_blocks--;
+ }
+ while (n_blocks && test_bit(block + n_blocks - 1,
+ v->validated_blocks))
+ n_blocks--;
+ if (!n_blocks)
+ return;
+ }
+
pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -621,8 +635,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
INIT_WORK(&pw->work, verity_prefetch_io);
pw->v = v;
- pw->block = io->block;
- pw->n_blocks = io->n_blocks;
+ pw->block = block;
+ pw->n_blocks = n_blocks;
queue_work(v->verify_wq, &pw->work);
}
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 7d727a72aa13..b9e27e37a943 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -442,7 +442,13 @@ static void writecache_notify_io(unsigned long error, void *context)
complete(&endio->c);
}
-static void ssd_commit_flushed(struct dm_writecache *wc)
+static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
+{
+ wait_event(wc->bio_in_progress_wait[direction],
+ !atomic_read(&wc->bio_in_progress[direction]));
+}
+
+static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
struct dm_io_region region;
struct dm_io_request req;
@@ -488,17 +494,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
writecache_notify_io(0, &endio);
wait_for_completion_io(&endio.c);
+ if (wait_for_ios)
+ writecache_wait_for_ios(wc, WRITE);
+
writecache_disk_flush(wc, wc->ssd_dev);
memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
-static void writecache_commit_flushed(struct dm_writecache *wc)
+static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
if (WC_MODE_PMEM(wc))
wmb();
else
- ssd_commit_flushed(wc);
+ ssd_commit_flushed(wc, wait_for_ios);
}
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
@@ -522,12 +531,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
writecache_error(wc, r, "error flushing metadata: %d", r);
}
-static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
-{
- wait_event(wc->bio_in_progress_wait[direction],
- !atomic_read(&wc->bio_in_progress[direction]));
-}
-
#define WFE_RETURN_FOLLOWING 1
#define WFE_LOWEST_SEQ 2
@@ -622,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry
wc->freelist_size++;
}
-static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
struct wc_entry *e;
@@ -631,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(!wc->current_free))
return NULL;
e = wc->current_free;
+ if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+ return NULL;
next = rb_next(&e->rb_node);
rb_erase(&e->rb_node, &wc->freetree);
if (unlikely(!next))
@@ -640,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(list_empty(&wc->freelist)))
return NULL;
e = container_of(wc->freelist.next, struct wc_entry, lru);
+ if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+ return NULL;
list_del(&e->lru);
}
wc->freelist_size--;
@@ -724,15 +731,12 @@ static void writecache_flush(struct dm_writecache *wc)
e = e2;
cond_resched();
}
- writecache_commit_flushed(wc);
-
- if (!WC_MODE_PMEM(wc))
- writecache_wait_for_ios(wc, WRITE);
+ writecache_commit_flushed(wc, true);
wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc->overwrote_committed = false;
@@ -756,7 +760,7 @@ static void writecache_flush(struct dm_writecache *wc)
}
if (need_flush_after_free)
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
static void writecache_flush_work(struct work_struct *work)
@@ -809,7 +813,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
}
if (discarded_something)
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
@@ -958,7 +962,7 @@ erase_this:
if (need_flush) {
writecache_flush_all_metadata(wc);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
wc_unlock(wc);
@@ -1193,7 +1197,7 @@ read_next_block:
goto bio_copy;
}
}
- e = writecache_pop_from_freelist(wc);
+ e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) {
writecache_wait_on_freelist(wc);
continue;
@@ -1205,9 +1209,26 @@ bio_copy:
if (WC_MODE_PMEM(wc)) {
bio_copy_block(wc, bio, memory_data(wc, e));
} else {
- dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+ unsigned bio_size = wc->block_size;
+ sector_t start_cache_sec = cache_sector(wc, e);
+ sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
+
+ while (bio_size < bio->bi_iter.bi_size) {
+ struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+ if (!f)
+ break;
+ write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+ (bio_size >> SECTOR_SHIFT), wc->seq_count);
+ writecache_insert_entry(wc, f);
+ wc->uncommitted_blocks++;
+ bio_size += wc->block_size;
+ current_cache_sec += wc->block_size >> SECTOR_SHIFT;
+ }
+
bio_set_dev(bio, wc->ssd_dev->bdev);
- bio->bi_iter.bi_sector = cache_sector(wc, e);
+ bio->bi_iter.bi_sector = start_cache_sec;
+ dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
wc->uncommitted_blocks = 0;
queue_work(wc->writeback_wq, &wc->flush_work);
@@ -1342,7 +1363,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *
wc->writeback_size--;
n_walked++;
if (unlikely(n_walked >= ENDIO_LATENCY)) {
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc_unlock(wc);
wc_lock(wc);
n_walked = 0;
@@ -1423,7 +1444,7 @@ pop_from_list:
writecache_wait_for_ios(wc, READ);
}
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc_unlock(wc);
}
@@ -1766,10 +1787,10 @@ static int init_memory(struct dm_writecache *wc)
write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
writecache_flush_all_metadata(wc);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
return 0;
}
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 22b3cb0050a7..516c7b671d25 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -134,6 +134,7 @@ struct dmz_metadata {
sector_t zone_bitmap_size;
unsigned int zone_nr_bitmap_blocks;
+ unsigned int zone_bits_per_mblk;
unsigned int nr_bitmap_blocks;
unsigned int nr_map_blocks;
@@ -1161,7 +1162,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
/* Init */
zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
- zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
+ zmd->zone_nr_bitmap_blocks =
+ max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
+ zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
+ DMZ_BLOCK_SIZE_BITS);
/* Allocate zone array */
zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
@@ -1956,7 +1960,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
dmz_release_mblock(zmd, to_mblk);
dmz_release_mblock(zmd, from_mblk);
- chunk_block += DMZ_BLOCK_SIZE_BITS;
+ chunk_block += zmd->zone_bits_per_mblk;
}
to_zone->weight = from_zone->weight;
@@ -2017,7 +2021,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Set bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
if (count) {
@@ -2096,7 +2100,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Clear bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_clear_bits((unsigned long *)mblk->data,
bit, nr_bits);
@@ -2156,6 +2160,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
{
struct dmz_mblock *mblk;
unsigned int bit, set_bit, nr_bits;
+ unsigned int zone_bits = zmd->zone_bits_per_mblk;
unsigned long *bitmap;
int n = 0;
@@ -2170,15 +2175,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Get offset */
bitmap = (unsigned long *) mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zone_bits - bit);
if (set)
- set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+ set_bit = find_next_bit(bitmap, zone_bits, bit);
else
- set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+ set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
dmz_release_mblock(zmd, mblk);
n += set_bit - bit;
- if (set_bit < DMZ_BLOCK_SIZE_BITS)
+ if (set_bit < zone_bits)
break;
nr_blocks -= nr_bits;
@@ -2281,7 +2286,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Count bits in this block */
bitmap = mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
n += dmz_count_bits(bitmap, bit, nr_bits);
dmz_release_mblock(zmd, mblk);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e8f9661a10a1..b89f07ee2eff 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1859,6 +1859,7 @@ static void dm_init_normal_md_queue(struct mapped_device *md)
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
+ md->queue->backing_dev_info->congested_data = md;
md->queue->backing_dev_info->congested_fn = dm_any_congested;
}
@@ -1949,7 +1950,12 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->queue)
goto bad;
md->queue->queuedata = md;
- md->queue->backing_dev_info->congested_data = md;
+ /*
+ * default to bio-based required ->make_request_fn until DM
+ * table is loaded and md->type established. If request-based
+ * table is loaded: blk-mq will override accordingly.
+ */
+ blk_queue_make_request(md->queue, dm_make_request);
md->disk = alloc_disk_node(1, md->numa_node_id);
if (!md->disk)
@@ -2264,7 +2270,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
case DM_TYPE_DAX_BIO_BASED:
case DM_TYPE_NVME_BIO_BASED:
dm_init_normal_md_queue(md);
- blk_queue_make_request(md->queue, dm_make_request);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index bd68f6fef694..d8b4125e338c 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -380,6 +380,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
return -ENOSPC;
}
+int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
+ dm_block_t begin, dm_block_t end, dm_block_t *b)
+{
+ int r;
+ uint32_t count;
+
+ do {
+ r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b);
+ if (r)
+ break;
+
+ /* double check this block wasn't used in the old transaction */
+ if (*b >= old_ll->nr_blocks)
+ count = 0;
+ else {
+ r = sm_ll_lookup(old_ll, *b, &count);
+ if (r)
+ break;
+
+ if (count)
+ begin = *b + 1;
+ }
+ } while (count);
+
+ return r;
+}
+
static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
int (*mutator)(void *context, uint32_t old, uint32_t *new),
void *context, enum allocation_event *ev)
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
index b3078d5eda0c..8de63ce39bdd 100644
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
dm_block_t end, dm_block_t *result);
+int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
+ dm_block_t begin, dm_block_t end, dm_block_t *result);
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index 32adf6b4a9c7..bf4c5e2ccb6f 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- /* FIXME: we should loop round a couple of times */
- r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
+ /*
+ * Any block we allocate has to be free in both the old and current ll.
+ */
+ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
if (r)
return r;
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 25328582cc48..9e3c64ec2026 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
+ /*
+ * Any block we allocate has to be free in both the old and current ll.
+ */
+ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
if (r)
return r;