summaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c634
1 files changed, 384 insertions, 250 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 92ac251e91e6..06d7279bdd04 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
-/*
- * Stripe cache
- */
-
-#define NR_STRIPES 256
-#define STRIPE_SIZE PAGE_SIZE
-#define STRIPE_SHIFT (PAGE_SHIFT - 9)
-#define STRIPE_SECTORS (STRIPE_SIZE>>9)
-#define IO_THRESHOLD 1
-#define BYPASS_THRESHOLD 1
-#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
-#define HASH_MASK (NR_HASH - 1)
-#define MAX_STRIPE_BATCH 8
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
@@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
local_irq_enable();
}
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap. There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This function is used to determine the 'next' bio in the list, given the sector
- * of the current stripe+device
- */
-static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
-{
- int sectors = bio_sectors(bio);
- if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
- return bio->bi_next;
- else
- return NULL;
-}
-
-/*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
- return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
- return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
- atomic_inc(segments);
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
- unsigned int cnt)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
- int old, new;
-
- do {
- old = atomic_read(segments);
- new = (old & 0xffff) | (cnt << 16);
- } while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
- atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
- atomic_set(segments, cnt);
-}
-
/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
@@ -289,8 +218,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
{
+ int i;
+ int injournal = 0; /* number of date pages with R5_InJournal */
+
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
+
+ if (r5c_is_writeback(conf->log))
+ for (i = sh->disks; i--; )
+ if (test_bit(R5_InJournal, &sh->dev[i].flags))
+ injournal++;
+ /*
+ * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
+ * data in journal, so they are not released to cached lists
+ */
+ if (conf->quiesce && r5c_is_writeback(conf->log) &&
+ !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+ if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+ r5c_make_stripe_write_out(sh);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
@@ -316,8 +264,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
< IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
atomic_dec(&conf->active_stripes);
- if (!test_bit(STRIPE_EXPANDING, &sh->state))
- list_add_tail(&sh->lru, temp_inactive_list);
+ if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+ if (!r5c_is_writeback(conf->log))
+ list_add_tail(&sh->lru, temp_inactive_list);
+ else {
+ WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
+ if (injournal == 0)
+ list_add_tail(&sh->lru, temp_inactive_list);
+ else if (injournal == conf->raid_disks - conf->max_degraded) {
+ /* full stripe */
+ if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+ atomic_inc(&conf->r5c_cached_full_stripes);
+ if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+ atomic_dec(&conf->r5c_cached_partial_stripes);
+ list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+ r5c_check_cached_full_stripe(conf);
+ } else {
+ /* partial stripe */
+ if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
+ &sh->state))
+ atomic_inc(&conf->r5c_cached_partial_stripes);
+ list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
+ }
+ }
+ }
}
}
@@ -541,7 +511,7 @@ retry:
if (dev->toread || dev->read || dev->towrite || dev->written ||
test_bit(R5_LOCKED, &dev->flags)) {
- printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
+ pr_err("sector=%llx i=%d %p %p %p %p %d\n",
(unsigned long long)sh->sector, i, dev->toread,
dev->read, dev->towrite, dev->written,
test_bit(R5_LOCKED, &dev->flags));
@@ -680,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
}
if (noblock && sh == NULL)
break;
+
+ r5c_check_stripe_cache_usage(conf);
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
+ r5l_wake_reclaim(conf->log, 0);
wait_event_lock_irq(
conf->wait_for_stripe,
!list_empty(conf->inactive_list + hash) &&
@@ -901,8 +874,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
- if (r5l_write_stripe(conf->log, sh) == 0)
- return;
+ if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+ /* writing out phase */
+ if (s->waiting_extra_page)
+ return;
+ if (r5l_write_stripe(conf->log, sh) == 0)
+ return;
+ } else { /* caching phase */
+ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+ r5c_cache_data(conf->log, sh, s);
+ return;
+ }
+ }
+
for (i = disks; i--; ) {
int op, op_flags = 0;
int replace_only = 0;
@@ -913,7 +897,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
op = REQ_OP_WRITE;
if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
- op_flags = WRITE_FUA;
+ op_flags = REQ_FUA;
if (test_bit(R5_Discard, &sh->dev[i].flags))
op = REQ_OP_DISCARD;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
@@ -977,7 +961,7 @@ again:
if (bad < 0) {
set_bit(BlockedBadBlocks, &rdev->flags);
if (!conf->mddev->external &&
- conf->mddev->flags) {
+ conf->mddev->sb_flags) {
/* It is very unlikely, but we might
* still need to write out the
* bad block log - better give it
@@ -1115,7 +1099,7 @@ again:
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page **page,
sector_t sector, struct dma_async_tx_descriptor *tx,
- struct stripe_head *sh)
+ struct stripe_head *sh, int no_skipcopy)
{
struct bio_vec bvl;
struct bvec_iter iter;
@@ -1155,7 +1139,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
if (frombio) {
if (sh->raid_conf->skip_copy &&
b_offset == 0 && page_offset == 0 &&
- clen == STRIPE_SIZE)
+ clen == STRIPE_SIZE &&
+ !no_skipcopy)
*page = bio_page;
else
tx = async_memcpy(*page, bio_page, page_offset,
@@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
tx = async_copy_data(0, rbi, &dev->page,
- dev->sector, tx, sh);
+ dev->sector, tx, sh, 0);
rbi = r5_next_bio(rbi, dev->sector);
}
}
@@ -1364,10 +1349,15 @@ static int set_syndrome_sources(struct page **srcs,
if (i == sh->qd_idx || i == sh->pd_idx ||
(srctype == SYNDROME_SRC_ALL) ||
(srctype == SYNDROME_SRC_WANT_DRAIN &&
- test_bit(R5_Wantdrain, &dev->flags)) ||
+ (test_bit(R5_Wantdrain, &dev->flags) ||
+ test_bit(R5_InJournal, &dev->flags))) ||
(srctype == SYNDROME_SRC_WRITTEN &&
- dev->written))
- srcs[slot] = sh->dev[i].page;
+ dev->written)) {
+ if (test_bit(R5_InJournal, &dev->flags))
+ srcs[slot] = sh->dev[i].orig_page;
+ else
+ srcs[slot] = sh->dev[i].page;
+ }
i = raid6_next_disk(i, disks);
} while (i != d0_idx);
@@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+
+ if (r5c_is_writeback(sh->raid_conf->log))
+ /*
+ * raid5-cache write back uses orig_page during prexor.
+ * After prexor, it is time to free orig_page
+ */
+ r5c_release_extra_page(sh);
}
static struct dma_async_tx_descriptor *
@@ -1567,7 +1564,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
/* Only process blocks that are known to be uptodate */
- if (test_bit(R5_Wantdrain, &dev->flags))
+ if (test_bit(R5_InJournal, &dev->flags))
+ xor_srcs[count++] = dev->orig_page;
+ else if (test_bit(R5_Wantdrain, &dev->flags))
xor_srcs[count++] = dev->page;
}
@@ -1601,6 +1600,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
+ struct r5conf *conf = sh->raid_conf;
int disks = sh->disks;
int i;
struct stripe_head *head_sh = sh;
@@ -1618,6 +1618,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
again:
dev = &sh->dev[i];
+ /*
+ * clear R5_InJournal, so when rewriting a page in
+ * journal, it is not skipped by r5l_log_stripe()
+ */
+ clear_bit(R5_InJournal, &dev->flags);
spin_lock_irq(&sh->stripe_lock);
chosen = dev->towrite;
dev->towrite = NULL;
@@ -1637,8 +1642,10 @@ again:
set_bit(R5_Discard, &dev->flags);
else {
tx = async_copy_data(1, wbi, &dev->page,
- dev->sector, tx, sh);
- if (dev->page != dev->orig_page) {
+ dev->sector, tx, sh,
+ r5c_is_writeback(conf->log));
+ if (dev->page != dev->orig_page &&
+ !r5c_is_writeback(conf->log)) {
set_bit(R5_SkipCopy, &dev->flags);
clear_bit(R5_UPTODATE, &dev->flags);
clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1746,7 +1753,8 @@ again:
xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (head_sh->dev[i].written)
+ if (head_sh->dev[i].written ||
+ test_bit(R5_InJournal, &head_sh->dev[i].flags))
xor_srcs[count++] = dev->page;
}
} else {
@@ -2000,17 +2008,15 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
spin_lock_init(&sh->batch_lock);
INIT_LIST_HEAD(&sh->batch_list);
INIT_LIST_HEAD(&sh->lru);
+ INIT_LIST_HEAD(&sh->r5c);
+ INIT_LIST_HEAD(&sh->log_list);
atomic_set(&sh->count, 1);
+ sh->log_start = MaxSector;
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
- bio_init(&dev->req);
- dev->req.bi_io_vec = &dev->vec;
- dev->req.bi_max_vecs = 1;
-
- bio_init(&dev->rreq);
- dev->rreq.bi_io_vec = &dev->rvec;
- dev->rreq.bi_max_vecs = 1;
+ bio_init(&dev->req, &dev->vec, 1);
+ bio_init(&dev->rreq, &dev->rvec, 1);
}
}
return sh;
@@ -2245,10 +2251,24 @@ static int resize_stripes(struct r5conf *conf, int newsize)
*/
ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
if (ndisks) {
- for (i=0; i<conf->raid_disks; i++)
+ for (i = 0; i < conf->pool_size; i++)
ndisks[i] = conf->disks[i];
- kfree(conf->disks);
- conf->disks = ndisks;
+
+ for (i = conf->pool_size; i < newsize; i++) {
+ ndisks[i].extra_page = alloc_page(GFP_NOIO);
+ if (!ndisks[i].extra_page)
+ err = -ENOMEM;
+ }
+
+ if (err) {
+ for (i = conf->pool_size; i < newsize; i++)
+ if (ndisks[i].extra_page)
+ put_page(ndisks[i].extra_page);
+ kfree(ndisks);
+ } else {
+ kfree(conf->disks);
+ conf->disks = ndisks;
+ }
} else
err = -ENOMEM;
@@ -2347,10 +2367,8 @@ static void raid5_end_read_request(struct bio * bi)
* replacement device. We just fail those on
* any error
*/
- printk_ratelimited(
- KERN_INFO
- "md/raid:%s: read error corrected"
- " (%lu sectors at %llu on %s)\n",
+ pr_info_ratelimited(
+ "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
mdname(conf->mddev), STRIPE_SECTORS,
(unsigned long long)s,
bdevname(rdev->bdev, b));
@@ -2370,36 +2388,29 @@ static void raid5_end_read_request(struct bio * bi)
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
atomic_inc(&rdev->read_errors);
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
- printk_ratelimited(
- KERN_WARNING
- "md/raid:%s: read error on replacement device "
- "(sector %llu on %s).\n",
+ pr_warn_ratelimited(
+ "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
mdname(conf->mddev),
(unsigned long long)s,
bdn);
else if (conf->mddev->degraded >= conf->max_degraded) {
set_bad = 1;
- printk_ratelimited(
- KERN_WARNING
- "md/raid:%s: read error not correctable "
- "(sector %llu on %s).\n",
+ pr_warn_ratelimited(
+ "md/raid:%s: read error not correctable (sector %llu on %s).\n",
mdname(conf->mddev),
(unsigned long long)s,
bdn);
} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
/* Oh, no!!! */
set_bad = 1;
- printk_ratelimited(
- KERN_WARNING
- "md/raid:%s: read error NOT corrected!! "
- "(sector %llu on %s).\n",
+ pr_warn_ratelimited(
+ "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
mdname(conf->mddev),
(unsigned long long)s,
bdn);
} else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes)
- printk(KERN_WARNING
- "md/raid:%s: Too many read errors, failing device %s.\n",
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
mdname(conf->mddev), bdn);
else
retry = 1;
@@ -2531,15 +2542,14 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
- set_mask_bits(&mddev->flags, 0,
- BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
- printk(KERN_ALERT
- "md/raid:%s: Disk failure on %s, disabling device.\n"
- "md/raid:%s: Operation continuing on %d devices.\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- mdname(mddev),
- conf->raid_disks - mddev->degraded);
+ set_mask_bits(&mddev->sb_flags, 0,
+ BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
+ pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
+ "md/raid:%s: Operation continuing on %d devices.\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ mdname(mddev),
+ conf->raid_disks - mddev->degraded);
}
/*
@@ -2861,8 +2871,8 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
previous, &dummy1, &sh2);
if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
|| sh2.qd_idx != sh->qd_idx) {
- printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
- mdname(conf->mddev));
+ pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
+ mdname(conf->mddev));
return 0;
}
return r_sector;
@@ -2877,6 +2887,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
int level = conf->level;
if (rcw) {
+ /*
+ * In some cases, handle_stripe_dirtying initially decided to
+ * run rmw and allocates extra page for prexor. However, rcw is
+ * cheaper later on. We need to free the extra page now,
+ * because we won't be able to do that in ops_complete_prexor().
+ */
+ r5c_release_extra_page(sh);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -2887,6 +2904,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
if (!expand)
clear_bit(R5_UPTODATE, &dev->flags);
s->locked++;
+ } else if (test_bit(R5_InJournal, &dev->flags)) {
+ set_bit(R5_LOCKED, &dev->flags);
+ s->locked++;
}
}
/* if we are not expanding this is a proper write request, and
@@ -2926,6 +2946,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
set_bit(R5_LOCKED, &dev->flags);
clear_bit(R5_UPTODATE, &dev->flags);
s->locked++;
+ } else if (test_bit(R5_InJournal, &dev->flags)) {
+ set_bit(R5_LOCKED, &dev->flags);
+ s->locked++;
}
}
if (!s->locked)
@@ -3569,10 +3592,10 @@ unhash:
break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
}
-static void handle_stripe_dirtying(struct r5conf *conf,
- struct stripe_head *sh,
- struct stripe_head_state *s,
- int disks)
+static int handle_stripe_dirtying(struct r5conf *conf,
+ struct stripe_head *sh,
+ struct stripe_head_state *s,
+ int disks)
{
int rmw = 0, rcw = 0, i;
sector_t recovery_cp = conf->mddev->recovery_cp;
@@ -3597,9 +3620,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
- if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+ if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+ test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
- !(test_bit(R5_UPTODATE, &dev->flags) ||
+ !((test_bit(R5_UPTODATE, &dev->flags) &&
+ (!test_bit(R5_InJournal, &dev->flags) ||
+ dev->page != dev->orig_page)) ||
test_bit(R5_Wantcompute, &dev->flags))) {
if (test_bit(R5_Insync, &dev->flags))
rmw++;
@@ -3611,13 +3637,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_Wantcompute, &dev->flags))) {
+ test_bit(R5_InJournal, &dev->flags) ||
+ test_bit(R5_Wantcompute, &dev->flags))) {
if (test_bit(R5_Insync, &dev->flags))
rcw++;
else
rcw += 2*disks;
}
}
+
pr_debug("for sector %llu, rmw=%d rcw=%d\n",
(unsigned long long)sh->sector, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -3629,10 +3657,44 @@ static void handle_stripe_dirtying(struct r5conf *conf,
(unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+ if (test_bit(R5_InJournal, &dev->flags) &&
+ dev->page == dev->orig_page &&
+ !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
+ /* alloc page for prexor */
+ struct page *p = alloc_page(GFP_NOIO);
+
+ if (p) {
+ dev->orig_page = p;
+ continue;
+ }
+
+ /*
+ * alloc_page() failed, try use
+ * disk_info->extra_page
+ */
+ if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
+ &conf->cache_state)) {
+ r5c_use_extra_page(sh);
+ break;
+ }
+
+ /* extra_page in use, add to delayed_list */
+ set_bit(STRIPE_DELAYED, &sh->state);
+ s->waiting_extra_page = 1;
+ return -EAGAIN;
+ }
+ }
+
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if ((dev->towrite ||
+ i == sh->pd_idx || i == sh->qd_idx ||
+ test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
- !(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_Wantcompute, &dev->flags)) &&
+ !((test_bit(R5_UPTODATE, &dev->flags) &&
+ (!test_bit(R5_InJournal, &dev->flags) ||
+ dev->page != dev->orig_page)) ||
+ test_bit(R5_Wantcompute, &dev->flags)) &&
test_bit(R5_Insync, &dev->flags)) {
if (test_bit(STRIPE_PREREAD_ACTIVE,
&sh->state)) {
@@ -3658,6 +3720,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_InJournal, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
rcw++;
if (test_bit(R5_Insync, &dev->flags) &&
@@ -3697,8 +3760,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
*/
if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
(s->locked == 0 && (rcw == 0 || rmw == 0) &&
- !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+ !test_bit(STRIPE_BIT_DELAY, &sh->state)))
schedule_reconstruction(sh, s, rcw == 0, 0);
+ return 0;
}
static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
@@ -3782,7 +3846,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
case check_state_compute_run:
break;
default:
- printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+ pr_err("%s: unknown check_state: %d sector: %llu\n",
__func__, sh->check_state,
(unsigned long long) sh->sector);
BUG();
@@ -3946,9 +4010,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
case check_state_compute_run:
break;
default:
- printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
- __func__, sh->check_state,
- (unsigned long long) sh->sector);
+ pr_warn("%s: unknown check_state: %d sector: %llu\n",
+ __func__, sh->check_state,
+ (unsigned long long) sh->sector);
BUG();
}
}
@@ -4188,6 +4252,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1;
}
+
+ if (test_bit(R5_InJournal, &dev->flags))
+ s->injournal++;
+ if (test_bit(R5_InJournal, &dev->flags) && dev->written)
+ s->just_cached++;
}
if (test_bit(STRIPE_SYNCING, &sh->state)) {
/* If there is a failed device being replaced,
@@ -4416,7 +4485,8 @@ static void handle_stripe(struct stripe_head *sh)
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_LOCKED, &dev->flags) &&
(i == sh->pd_idx || i == sh->qd_idx ||
- dev->written)) {
+ dev->written || test_bit(R5_InJournal,
+ &dev->flags))) {
pr_debug("Writing block %d\n", i);
set_bit(R5_Wantwrite, &dev->flags);
if (prexor)
@@ -4456,6 +4526,10 @@ static void handle_stripe(struct stripe_head *sh)
test_bit(R5_Discard, &qdev->flags))))))
handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+ if (s.just_cached)
+ r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+ r5l_stripe_write_finished(sh);
+
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
* or to load a block that is being partially written.
@@ -4467,14 +4541,51 @@ static void handle_stripe(struct stripe_head *sh)
|| s.expanding)
handle_stripe_fill(sh, &s, disks);
- /* Now to consider new write requests and what else, if anything
- * should be read. We do not handle new writes when:
+ /*
+ * When the stripe finishes full journal write cycle (write to journal
+ * and raid disk), this is the clean up procedure so it is ready for
+ * next operation.
+ */
+ r5c_finish_stripe_write_out(conf, sh, &s);
+
+ /*
+ * Now to consider new write requests, cache write back and what else,
+ * if anything should be read. We do not handle new writes when:
* 1/ A 'write' operation (copy+xor) is already in flight.
* 2/ A 'check' operation is in flight, as it may clobber the parity
* block.
+ * 3/ A r5c cache log write is in flight.
*/
- if (s.to_write && !sh->reconstruct_state && !sh->check_state)
- handle_stripe_dirtying(conf, sh, &s, disks);
+
+ if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
+ if (!r5c_is_writeback(conf->log)) {
+ if (s.to_write)
+ handle_stripe_dirtying(conf, sh, &s, disks);
+ } else { /* write back cache */
+ int ret = 0;
+
+ /* First, try handle writes in caching phase */
+ if (s.to_write)
+ ret = r5c_try_caching_write(conf, sh, &s,
+ disks);
+ /*
+ * If caching phase failed: ret == -EAGAIN
+ * OR
+ * stripe under reclaim: !caching && injournal
+ *
+ * fall back to handle_stripe_dirtying()
+ */
+ if (ret == -EAGAIN ||
+ /* stripe under reclaim: !caching && injournal */
+ (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
+ s.injournal > 0)) {
+ ret = handle_stripe_dirtying(conf, sh, &s,
+ disks);
+ if (ret == -EAGAIN)
+ goto finish;
+ }
+ }
+ }
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough
@@ -4645,9 +4756,7 @@ finish:
}
if (!bio_list_empty(&s.return_bi)) {
- if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) &&
- (s.failed <= conf->max_degraded ||
- conf->mddev->external == 0)) {
+ if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->return_bi, &s.return_bi);
spin_unlock_irq(&conf->device_lock);
@@ -4703,6 +4812,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
return 1;
+
+ /* Also checks whether there is pressure on r5cache log space */
+ if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
+ return 1;
if (conf->quiesce)
return 1;
if (atomic_read(&conf->empty_inactive_list_nr))
@@ -5172,6 +5285,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
int remaining;
DEFINE_WAIT(w);
bool do_prepare;
+ bool do_flush = false;
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5183,6 +5297,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
return;
}
/* ret == -EAGAIN, fallback */
+ /*
+ * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
+ * we need to flush journal device
+ */
+ do_flush = bi->bi_opf & REQ_PREFLUSH;
}
md_write_start(mddev, bi);
@@ -5193,6 +5312,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
* data on failed drives.
*/
if (rw == READ && mddev->degraded == 0 &&
+ !r5c_is_writeback(conf->log) &&
mddev->reshape_position == MaxSector) {
bi = chunk_aligned_read(mddev, bi);
if (!bi)
@@ -5321,6 +5441,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
do_prepare = true;
goto retry;
}
+ if (do_flush) {
+ set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+ /* we only need flush for one stripe */
+ do_flush = false;
+ }
+
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
@@ -5486,9 +5612,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
- wait_event(mddev->sb_wait, mddev->flags == 0 ||
+ wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
return 0;
@@ -5584,10 +5710,10 @@ finish:
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
- !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+ !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto ret;
@@ -5862,10 +5988,10 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
if (!bio_list_empty(&conf->return_bi) &&
- !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
struct bio_list tmp = BIO_EMPTY_LIST;
spin_lock_irq(&conf->device_lock);
- if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
bio_list_merge(&tmp, &conf->return_bi);
bio_list_init(&conf->return_bi);
}
@@ -5912,7 +6038,7 @@ static void raid5d(struct md_thread *thread)
break;
handled += batch_size;
- if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+ if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
@@ -6242,6 +6368,7 @@ static struct attribute *raid5_attrs[] = {
&raid5_group_thread_cnt.attr,
&raid5_skip_copy.attr,
&raid5_rmw_level.attr,
+ &r5c_journal_mode.attr,
NULL,
};
static struct attribute_group raid5_attrs_group = {
@@ -6368,6 +6495,8 @@ static void raid5_free_percpu(struct r5conf *conf)
static void free_conf(struct r5conf *conf)
{
+ int i;
+
if (conf->log)
r5l_exit_log(conf->log);
if (conf->shrinker.nr_deferred)
@@ -6376,6 +6505,9 @@ static void free_conf(struct r5conf *conf)
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
+ for (i = 0; i < conf->pool_size; i++)
+ if (conf->disks[i].extra_page)
+ put_page(conf->disks[i].extra_page);
kfree(conf->disks);
kfree(conf->stripe_hashtbl);
kfree(conf);
@@ -6387,8 +6519,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
if (alloc_scratch_buffer(conf, percpu)) {
- pr_err("%s: failed memory allocation for cpu%u\n",
- __func__, cpu);
+ pr_warn("%s: failed memory allocation for cpu%u\n",
+ __func__, cpu);
return -ENOMEM;
}
return 0;
@@ -6458,29 +6590,29 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (mddev->new_level != 5
&& mddev->new_level != 4
&& mddev->new_level != 6) {
- printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
- mdname(mddev), mddev->new_level);
+ pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
+ mdname(mddev), mddev->new_level);
return ERR_PTR(-EIO);
}
if ((mddev->new_level == 5
&& !algorithm_valid_raid5(mddev->new_layout)) ||
(mddev->new_level == 6
&& !algorithm_valid_raid6(mddev->new_layout))) {
- printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
- mdname(mddev), mddev->new_layout);
+ pr_warn("md/raid:%s: layout %d not supported\n",
+ mdname(mddev), mddev->new_layout);
return ERR_PTR(-EIO);
}
if (mddev->new_level == 6 && mddev->raid_disks < 4) {
- printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
- mdname(mddev), mddev->raid_disks);
+ pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
+ mdname(mddev), mddev->raid_disks);
return ERR_PTR(-EINVAL);
}
if (!mddev->new_chunk_sectors ||
(mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
!is_power_of_2(mddev->new_chunk_sectors)) {
- printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
- mdname(mddev), mddev->new_chunk_sectors << 9);
+ pr_warn("md/raid:%s: invalid chunk size %d\n",
+ mdname(mddev), mddev->new_chunk_sectors << 9);
return ERR_PTR(-EINVAL);
}
@@ -6522,9 +6654,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
GFP_KERNEL);
+
if (!conf->disks)
goto abort;
+ for (i = 0; i < max_disks; i++) {
+ conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
+ if (!conf->disks[i].extra_page)
+ goto abort;
+ }
+
conf->mddev = mddev;
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -6545,6 +6684,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
INIT_LIST_HEAD(conf->temp_inactive_list + i);
+ atomic_set(&conf->r5c_cached_full_stripes, 0);
+ INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
+ atomic_set(&conf->r5c_cached_partial_stripes, 0);
+ INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+
conf->level = mddev->new_level;
conf->chunk_sectors = mddev->new_chunk_sectors;
if (raid5_alloc_percpu(conf) != 0)
@@ -6571,9 +6715,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (test_bit(In_sync, &rdev->flags)) {
char b[BDEVNAME_SIZE];
- printk(KERN_INFO "md/raid:%s: device %s operational as raid"
- " disk %d\n",
- mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
+ pr_info("md/raid:%s: device %s operational as raid disk %d\n",
+ mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
} else if (rdev->saved_raid_disk != raid_disk)
/* Cannot rely on bitmap to complete recovery */
conf->fullsync = 1;
@@ -6607,21 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
conf->min_nr_stripes = max(NR_STRIPES, stripes);
if (conf->min_nr_stripes != NR_STRIPES)
- printk(KERN_INFO
- "md/raid:%s: force stripe size %d for reshape\n",
+ pr_info("md/raid:%s: force stripe size %d for reshape\n",
mdname(mddev), conf->min_nr_stripes);
}
memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
if (grow_stripes(conf, conf->min_nr_stripes)) {
- printk(KERN_ERR
- "md/raid:%s: couldn't allocate %dkB for buffers\n",
- mdname(mddev), memory);
+ pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
+ mdname(mddev), memory);
goto abort;
} else
- printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
- mdname(mddev), memory);
+ pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
/*
* Losing a stripe head costs more than the time to refill it,
* it reduces the queue depth and so can hurt throughput.
@@ -6633,18 +6773,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->shrinker.batch = 128;
conf->shrinker.flags = 0;
if (register_shrinker(&conf->shrinker)) {
- printk(KERN_ERR
- "md/raid:%s: couldn't register shrinker.\n",
- mdname(mddev));
+ pr_warn("md/raid:%s: couldn't register shrinker.\n",
+ mdname(mddev));
goto abort;
}
sprintf(pers_name, "raid%d", mddev->new_level);
conf->thread = md_register_thread(raid5d, mddev, pers_name);
if (!conf->thread) {
- printk(KERN_ERR
- "md/raid:%s: couldn't allocate thread.\n",
- mdname(mddev));
+ pr_warn("md/raid:%s: couldn't allocate thread.\n",
+ mdname(mddev));
goto abort;
}
@@ -6697,9 +6835,8 @@ static int raid5_run(struct mddev *mddev)
int first = 1;
if (mddev->recovery_cp != MaxSector)
- printk(KERN_NOTICE "md/raid:%s: not clean"
- " -- starting background reconstruction\n",
- mdname(mddev));
+ pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
+ mdname(mddev));
rdev_for_each(rdev, mddev) {
long long diff;
@@ -6742,15 +6879,14 @@ static int raid5_run(struct mddev *mddev)
int new_data_disks;
if (journal_dev) {
- printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
- mdname(mddev));
+ pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
+ mdname(mddev));
return -EINVAL;
}
if (mddev->new_level != mddev->level) {
- printk(KERN_ERR "md/raid:%s: unsupported reshape "
- "required - aborting.\n",
- mdname(mddev));
+ pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
+ mdname(mddev));
return -EINVAL;
}
old_disks = mddev->raid_disks - mddev->delta_disks;
@@ -6765,8 +6901,8 @@ static int raid5_run(struct mddev *mddev)
chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
new_data_disks = mddev->raid_disks - max_degraded;
if (sector_div(here_new, chunk_sectors * new_data_disks)) {
- printk(KERN_ERR "md/raid:%s: reshape_position not "
- "on a stripe boundary\n", mdname(mddev));
+ pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
+ mdname(mddev));
return -EINVAL;
}
reshape_offset = here_new * chunk_sectors;
@@ -6787,10 +6923,8 @@ static int raid5_run(struct mddev *mddev)
abs(min_offset_diff) >= mddev->new_chunk_sectors)
/* not really in-place - so OK */;
else if (mddev->ro == 0) {
- printk(KERN_ERR "md/raid:%s: in-place reshape "
- "must be started in read-only mode "
- "- aborting\n",
- mdname(mddev));
+ pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
+ mdname(mddev));
return -EINVAL;
}
} else if (mddev->reshape_backwards
@@ -6799,13 +6933,11 @@ static int raid5_run(struct mddev *mddev)
: (here_new * chunk_sectors >=
here_old * chunk_sectors + (-min_offset_diff))) {
/* Reading from the same stripe as writing to - bad */
- printk(KERN_ERR "md/raid:%s: reshape_position too early for "
- "auto-recovery - aborting.\n",
- mdname(mddev));
+ pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
+ mdname(mddev));
return -EINVAL;
}
- printk(KERN_INFO "md/raid:%s: reshape will continue\n",
- mdname(mddev));
+ pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
/* OK, we should be able to continue; */
} else {
BUG_ON(mddev->level != mddev->new_level);
@@ -6824,8 +6956,8 @@ static int raid5_run(struct mddev *mddev)
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
if (!journal_dev) {
- pr_err("md/raid:%s: journal disk is missing, force array readonly\n",
- mdname(mddev));
+ pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
+ mdname(mddev));
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
} else if (mddev->recovery_cp == MaxSector)
@@ -6852,8 +6984,7 @@ static int raid5_run(struct mddev *mddev)
if (conf->disks[i].replacement &&
conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */
- printk(KERN_ERR "md: cannot handle concurrent "
- "replacement and reshape.\n");
+ pr_warn("md: cannot handle concurrent replacement and reshape.\n");
goto abort;
}
if (test_bit(In_sync, &rdev->flags)) {
@@ -6895,8 +7026,7 @@ static int raid5_run(struct mddev *mddev)
mddev->degraded = calc_degraded(conf);
if (has_failed(conf)) {
- printk(KERN_ERR "md/raid:%s: not enough operational devices"
- " (%d/%d failed)\n",
+ pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
mdname(mddev), mddev->degraded, conf->raid_disks);
goto abort;
}
@@ -6908,29 +7038,19 @@ static int raid5_run(struct mddev *mddev)
if (mddev->degraded > dirty_parity_disks &&
mddev->recovery_cp != MaxSector) {
if (mddev->ok_start_degraded)
- printk(KERN_WARNING
- "md/raid:%s: starting dirty degraded array"
- " - data corruption possible.\n",
- mdname(mddev));
+ pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
+ mdname(mddev));
else {
- printk(KERN_ERR
- "md/raid:%s: cannot start dirty degraded array.\n",
- mdname(mddev));
+ pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
+ mdname(mddev));
goto abort;
}
}
- if (mddev->degraded == 0)
- printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
- " devices, algorithm %d\n", mdname(mddev), conf->level,
- mddev->raid_disks-mddev->degraded, mddev->raid_disks,
- mddev->new_layout);
- else
- printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
- " out of %d devices, algorithm %d\n",
- mdname(mddev), conf->level,
- mddev->raid_disks - mddev->degraded,
- mddev->raid_disks, mddev->new_layout);
+ pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
+ mdname(mddev), conf->level,
+ mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+ mddev->new_layout);
print_raid5_conf(conf);
@@ -6950,9 +7070,8 @@ static int raid5_run(struct mddev *mddev)
mddev->to_remove = NULL;
else if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
- printk(KERN_WARNING
- "raid5: failed to create sysfs attributes for %s\n",
- mdname(mddev));
+ pr_warn("raid5: failed to create sysfs attributes for %s\n",
+ mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
if (mddev->queue) {
@@ -6984,6 +7103,15 @@ static int raid5_run(struct mddev *mddev)
stripe = (stripe | (stripe-1)) + 1;
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
+
+ /*
+ * We use 16-bit counter of active stripes in bi_phys_segments
+ * (minus one for over-loaded initialization)
+ */
+ blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
+ blk_queue_max_discard_sectors(mddev->queue,
+ 0xfffe * STRIPE_SECTORS);
+
/*
* unaligned part of discard request will be ignored, so can't
* guarantee discard_zeroes_data
@@ -7040,9 +7168,10 @@ static int raid5_run(struct mddev *mddev)
if (journal_dev) {
char b[BDEVNAME_SIZE];
- printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(journal_dev->bdev, b));
- r5l_init_log(conf, journal_dev);
+ pr_debug("md/raid:%s: using device %s as journal\n",
+ mdname(mddev), bdevname(journal_dev->bdev, b));
+ if (r5l_init_log(conf, journal_dev))
+ goto abort;
}
return 0;
@@ -7051,7 +7180,7 @@ abort:
print_raid5_conf(conf);
free_conf(conf);
mddev->private = NULL;
- printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
+ pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
return -EIO;
}
@@ -7085,12 +7214,12 @@ static void print_raid5_conf (struct r5conf *conf)
int i;
struct disk_info *tmp;
- printk(KERN_DEBUG "RAID conf printout:\n");
+ pr_debug("RAID conf printout:\n");
if (!conf) {
- printk("(conf==NULL)\n");
+ pr_debug("(conf==NULL)\n");
return;
}
- printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
+ pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
conf->raid_disks,
conf->raid_disks - conf->mddev->degraded);
@@ -7098,7 +7227,7 @@ static void print_raid5_conf (struct r5conf *conf)
char b[BDEVNAME_SIZE];
tmp = conf->disks + i;
if (tmp->rdev)
- printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
+ pr_debug(" disk %d, o:%d, dev:%s\n",
i, !test_bit(Faulty, &tmp->rdev->flags),
bdevname(tmp->rdev->bdev, b));
}
@@ -7246,8 +7375,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* write requests running. We should be safe
*/
r5l_init_log(conf, rdev);
- printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(rdev->bdev, b));
+ pr_debug("md/raid:%s: using device %s as journal\n",
+ mdname(mddev), bdevname(rdev->bdev, b));
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7351,10 +7480,10 @@ static int check_stripe_cache(struct mddev *mddev)
> conf->min_nr_stripes ||
((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
> conf->min_nr_stripes) {
- printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
- mdname(mddev),
- ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
- / STRIPE_SIZE)*4);
+ pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
+ mdname(mddev),
+ ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
+ / STRIPE_SIZE)*4);
return 0;
}
return 1;
@@ -7435,8 +7564,8 @@ static int raid5_start_reshape(struct mddev *mddev)
*/
if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
< mddev->array_sectors) {
- printk(KERN_ERR "md/raid:%s: array size must be reduced "
- "before number of disks\n", mdname(mddev));
+ pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
+ mdname(mddev));
return -EINVAL;
}
@@ -7506,7 +7635,7 @@ static int raid5_start_reshape(struct mddev *mddev)
}
mddev->raid_disks = conf->raid_disks;
mddev->reshape_position = conf->reshape_progress;
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -7624,6 +7753,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
/* '2' tells resync/reshape to pause so that all
* active stripes can drain
*/
+ r5c_flush_cache(conf, INT_MAX);
conf->quiesce = 2;
wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
@@ -7654,8 +7784,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
/* for raid0 takeover only one zone is supported */
if (raid0_conf->nr_strip_zones > 1) {
- printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
- mdname(mddev));
+ pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
+ mdname(mddev));
return ERR_PTR(-EINVAL);
}
@@ -7676,6 +7806,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
static void *raid5_takeover_raid1(struct mddev *mddev)
{
int chunksect;
+ void *ret;
if (mddev->raid_disks != 2 ||
mddev->degraded > 1)
@@ -7697,7 +7828,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
mddev->new_chunk_sectors = chunksect;
- return setup_conf(mddev);
+ ret = setup_conf(mddev);
+ if (!IS_ERR_VALUE(ret))
+ clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+ return ret;
}
static void *raid5_takeover_raid6(struct mddev *mddev)
@@ -7767,7 +7901,7 @@ static int raid5_check_reshape(struct mddev *mddev)
conf->chunk_sectors = new_chunk ;
mddev->chunk_sectors = new_chunk;
}
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
}
return check_reshape(mddev);