summaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
authorArtur Paszkiewicz <artur.paszkiewicz@intel.com>2017-03-09 11:59:59 +0300
committerShaohua Li <shli@fb.com>2017-03-17 02:55:54 +0300
commit3418d036c81dcb604b7c7c71b209d5890a8418aa (patch)
treed02a31103e09f82858bf149ebcb511e12ed6065a /drivers/md/raid5.c
parentff875738edd44e3bc892d378deacc50bccc9d70c (diff)
downloadlinux-3418d036c81dcb604b7c7c71b209d5890a8418aa.tar.xz
raid5-ppl: Partial Parity Log write logging implementation
Implement the calculation of partial parity for a stripe and PPL write logging functionality. The description of PPL is added to the documentation. More details can be found in the comments in raid5-ppl.c. Attach a page for holding the partial parity data to stripe_head. Allocate it only if mddev has the MD_HAS_PPL flag set. Partial parity is the xor of not modified data chunks of a stripe and is calculated as follows: - reconstruct-write case: xor data from all not updated disks in a stripe - read-modify-write case: xor old data and parity from all updated disks in a stripe Implement it using the async_tx API and integrate into raid_run_ops(). It must be called when we still have access to old data, so do it when STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is stored into sh->ppl_page. Partial parity is not meaningful for full stripe write and is not stored in the log or used for recovery, so don't attempt to calculate it when stripe has STRIPE_FULL_WRITE. Put the PPL metadata structures to md_p.h because userspace tools (mdadm) will also need to read/write PPL. Warn about using PPL with enabled disk volatile write-back cache for now. It can be removed once disk cache flushing before writing PPL is implemented. Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com> Signed-off-by: Shaohua Li <shli@fb.com>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c64
1 files changed, 61 insertions, 3 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f575f40d2acb..6b86e0826afe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -482,6 +482,11 @@ static void shrink_buffers(struct stripe_head *sh)
sh->dev[i].page = NULL;
put_page(p);
}
+
+ if (sh->ppl_page) {
+ put_page(sh->ppl_page);
+ sh->ppl_page = NULL;
+ }
}
static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -498,6 +503,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
}
+
+ if (raid5_has_ppl(sh->raid_conf)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page)
+ return 1;
+ }
+
return 0;
}
@@ -746,7 +758,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -2093,6 +2105,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
async_tx_ack(tx);
}
+ if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
if (level < 6)
tx = ops_run_prexor5(sh, percpu, tx);
@@ -3168,6 +3183,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
+ if (raid5_has_ppl(sh->raid_conf) &&
+ test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+ !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+ test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+ set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
s->locked, s->ops_request);
@@ -3215,6 +3236,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (forwrite && raid5_has_ppl(conf)) {
+ /*
+ * With PPL only writes to consecutive data chunks within a
+ * stripe are allowed because for a single stripe_head we can
+ * only have one PPL entry at a time, which describes one data
+ * range. Not really an overlap, but wait_for_overlap can be
+ * used to handle this.
+ */
+ sector_t sector;
+ sector_t first = 0;
+ sector_t last = 0;
+ int count = 0;
+ int i;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (i != sh->pd_idx &&
+ (i == dd_idx || sh->dev[i].towrite)) {
+ sector = sh->dev[i].sector;
+ if (count == 0 || sector < first)
+ first = sector;
+ if (sector > last)
+ last = sector;
+ count++;
+ }
+ }
+
+ if (first + conf->chunk_sectors * (count - 1) != last)
+ goto overlap;
+ }
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@@ -7208,6 +7259,13 @@ static int raid5_run(struct mddev *mddev)
BUG_ON(mddev->delta_disks != 0);
}
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+ test_bit(MD_HAS_PPL, &mddev->flags)) {
+ pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+ mdname(mddev));
+ clear_bit(MD_HAS_PPL, &mddev->flags);
+ }
+
if (mddev->private == NULL)
conf = setup_conf(mddev);
else
@@ -7689,7 +7747,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7740,7 +7798,7 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&