summaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
authorSong Liu <songliubraving@fb.com>2016-11-18 02:24:40 +0300
committerShaohua Li <shli@fb.com>2016-11-19 00:26:48 +0300
commita39f7afde358ca89e9fc09a5525d3f8631a98a3a (patch)
tree25c08dcdb44ee34bc670b54d7e5ca9d1d03275f4 /drivers/md/raid5.c
parent1e6d690b9334b7e1b31d25fd8d93e980e449a5f9 (diff)
downloadlinux-a39f7afde358ca89e9fc09a5525d3f8631a98a3a.tar.xz
md/r5cache: write-out phase and reclaim support
There are two limited resources, stripe cache and journal disk space. For better performance, we priotize reclaim of full stripe writes. To free up more journal space, we free earliest data on the journal. In current implementation, reclaim happens when: 1. Periodically (every R5C_RECLAIM_WAKEUP_INTERVAL, 30 seconds) reclaim if there is no reclaim in the past 5 seconds. 2. when there are R5C_FULL_STRIPE_FLUSH_BATCH (256) cached full stripes, or cached stripes is enough for a full stripe (chunk size / 4k) (r5c_check_cached_full_stripe) 3. when there is pressure on stripe cache (r5c_check_stripe_cache_usage) 4. when there is pressure on journal space (r5l_write_stripe, r5c_cache_data) r5c_do_reclaim() contains new logic of reclaim. For stripe cache: When stripe cache pressure is high (more than 3/4 stripes are cached, or there is empty inactive lists), flush all full stripe. If fewer than R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) full stripes are flushed, flush some paritial stripes. When stripe cache pressure is moderate (1/2 to 3/4 of stripes are cached), flush all full stripes. For log space: To avoid deadlock due to log space, we need to reserve enough space to flush cached data. The size of required log space depends on total number of cached stripes (stripe_in_journal_count). In current implementation, the writing-out phase automatically include pending data writes with parity writes (similar to write through case). Therefore, we need up to (conf->raid_disks + 1) pages for each cached stripe (1 page for meta data, raid_disks pages for all data and parity). r5c_log_required_to_flush_cache() calculates log space required to flush cache. In the following, we refer to the space calculated by r5c_log_required_to_flush_cache() as reclaim_required_space. Two flags are added to r5conf->cache_state: R5C_LOG_TIGHT and R5C_LOG_CRITICAL. R5C_LOG_TIGHT is set when free space on the log device is less than 3x of reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log device is less than 2x of reclaim_required_space. r5c_cache keeps all data in cache (not fully committed to RAID) in a list (stripe_in_journal_list). These stripes are in the order of their first appearance on the journal. So the log tail (last_checkpoint) should point to the journal_start of the first item in the list. When R5C_LOG_TIGHT is set, r5l_reclaim_thread starts flushing out stripes at the head of stripe_in_journal. When R5C_LOG_CRITICAL is set, the state machine only writes data that are already in the log device (in stripe_in_journal_list). This patch includes a fix to improve performance by Shaohua Li <shli@fb.com>. Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Shaohua Li <shli@fb.com>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c21
1 files changed, 21 insertions, 0 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f535ce2c267a..90638ba67812 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -228,6 +228,16 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
for (i = sh->disks; i--; )
if (test_bit(R5_InJournal, &sh->dev[i].flags))
injournal++;
+ /*
+ * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
+ * data in journal, so they are not released to cached lists
+ */
+ if (conf->quiesce && r5c_is_writeback(conf->log) &&
+ !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+ if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+ r5c_make_stripe_write_out(sh);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
@@ -268,6 +278,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
atomic_dec(&conf->r5c_cached_partial_stripes);
list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+ r5c_check_cached_full_stripe(conf);
} else {
/* partial stripe */
if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
@@ -639,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
}
if (noblock && sh == NULL)
break;
+
+ r5c_check_stripe_cache_usage(conf);
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
+ r5l_wake_reclaim(conf->log, 0);
wait_event_lock_irq(
conf->wait_for_stripe,
!list_empty(conf->inactive_list + hash) &&
@@ -1992,7 +2006,9 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
spin_lock_init(&sh->batch_lock);
INIT_LIST_HEAD(&sh->batch_list);
INIT_LIST_HEAD(&sh->lru);
+ INIT_LIST_HEAD(&sh->r5c);
atomic_set(&sh->count, 1);
+ sh->log_start = MaxSector;
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
@@ -4759,6 +4775,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
return 1;
+
+ /* Also checks whether there is pressure on r5cache log space */
+ if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
+ return 1;
if (conf->quiesce)
return 1;
if (atomic_read(&conf->empty_inactive_list_nr))
@@ -7661,6 +7681,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
/* '2' tells resync/reshape to pause so that all
* active stripes can drain
*/
+ r5c_flush_cache(conf, INT_MAX);
conf->quiesce = 2;
wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&