Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r-- | block/blk-throttle.c | 985
1 file changed, 910 insertions, 75 deletions
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8fab716e4059..b78db2e5fdff 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8;
 /* Total max dispatch from all groups in one round */
 static int throtl_quantum = 32;
-/* Throttling is performed over 100ms slice and after that slice is renewed */
-static unsigned long throtl_slice = HZ/10; /* 100 ms */
+/* Throttling is performed over a slice and after that slice is renewed */
+#define DFL_THROTL_SLICE_HD (HZ / 10)
+#define DFL_THROTL_SLICE_SSD (HZ / 50)
+#define MAX_THROTL_SLICE (HZ)
+#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
+#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
+#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
+/* default latency target is 0, eg, guarantee IO latency by default */
+#define DFL_LATENCY_TARGET (0)
+
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
 static struct blkcg_policy blkcg_policy_throtl;
@@ -83,6 +92,12 @@ enum tg_state_flags {
 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
+enum {
+	LIMIT_LOW,
+	LIMIT_MAX,
+	LIMIT_CNT,
+};
+
 struct throtl_grp {
 	/* must be the first member */
 	struct blkg_policy_data pd;
@@ -119,20 +134,54 @@ struct throtl_grp {
 	/* are there any throtl rules between this group and td? */
 	bool has_rules[2];
-	/* bytes per second rate limits */
-	uint64_t bps[2];
+	/* internally used bytes per second rate limits */
+	uint64_t bps[2][LIMIT_CNT];
+	/* user configured bps limits */
+	uint64_t bps_conf[2][LIMIT_CNT];
-	/* IOPS limits */
-	unsigned int iops[2];
+	/* internally used IOPS limits */
+	unsigned int iops[2][LIMIT_CNT];
+	/* user configured IOPS limits */
+	unsigned int iops_conf[2][LIMIT_CNT];
 	/* Number of bytes disptached in current slice */
 	uint64_t bytes_disp[2];
 	/* Number of bio's dispatched in current slice */
 	unsigned int io_disp[2];
+	unsigned long last_low_overflow_time[2];
+
+	uint64_t last_bytes_disp[2];
+	unsigned int last_io_disp[2];
+
+	unsigned long last_check_time;
+
+	unsigned long latency_target; /* us */
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
+
+	unsigned long last_finish_time; /* ns / 1024 */
+	unsigned long checked_last_finish_time; /* ns / 1024 */
+	unsigned long avg_idletime; /* ns / 1024 */
+	unsigned long idletime_threshold; /* us */
+
+	unsigned int bio_cnt; /* total bios */
+	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+	unsigned long bio_cnt_reset_time;
+};
+
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+	unsigned long total_latency; /* ns / 1024 */
+	int samples;
+};
+
+struct avg_latency_bucket {
+	unsigned long latency; /* ns / 1024 */
+	bool valid;
 };
 struct throtl_data
@@ -145,8 +194,26 @@ struct throtl_data
 	/* Total Number of queued bios on READ and WRITE lists */
 	unsigned int nr_queued[2];
+	unsigned int throtl_slice;
+
 	/* Work for dispatching throttled bios */
 	struct work_struct dispatch_work;
+	unsigned int limit_index;
+	bool limit_valid[LIMIT_CNT];
+
+	unsigned long dft_idletime_threshold; /* us */
+
+	unsigned long low_upgrade_time;
+	unsigned long low_downgrade_time;
+
+	unsigned int scale;
+
+	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+	struct latency_bucket __percpu *latency_buckets;
+	unsigned long last_calculate_time;
+
+	bool track_bio_latency;
 };
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 	return container_of(sq, struct throtl_data, service_queue);
 }
+/*
+ * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
+ * make the IO dispatch more smooth.
+ * Scale up: linearly scale up according to lapsed time since upgrade. For
+ *           every throtl_slice, the limit scales up 1/2 .low limit till the
+ *           limit hits .max limit
+ * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
+ */
+static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
+{
+	/* arbitrary value to avoid too big scale */
+	if (td->scale < 4096 && time_after_eq(jiffies,
+	    td->low_upgrade_time + td->scale * td->throtl_slice))
+		td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
+
+	return low + (low >> 1) * td->scale;
+}
+
+static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
+{
+	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	struct throtl_data *td;
+	uint64_t ret;
+
+	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+		return U64_MAX;
+
+	td = tg->td;
+	ret = tg->bps[rw][td->limit_index];
+	if (ret == 0 && td->limit_index == LIMIT_LOW)
+		return tg->bps[rw][LIMIT_MAX];
+
+	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
+	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
+		uint64_t adjusted;
+
+		adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
+		ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
+	}
+	return ret;
+}
+
+static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
+{
+	struct blkcg_gq *blkg = tg_to_blkg(tg);
+	struct throtl_data *td;
+	unsigned int ret;
+
+	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+		return UINT_MAX;
+	td = tg->td;
+	ret = tg->iops[rw][td->limit_index];
+	if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
+		return tg->iops[rw][LIMIT_MAX];
+
+	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
+	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
+		uint64_t adjusted;
+
+		adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
+		if (adjusted > UINT_MAX)
+			adjusted = UINT_MAX;
+		ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
+	}
+	return ret;
+}
+
+#define request_bucket_index(sectors) \
+	clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 	}
 	RB_CLEAR_NODE(&tg->rb_node);
-	tg->bps[READ] = -1;
-	tg->bps[WRITE] = -1;
-	tg->iops[READ] = -1;
-	tg->iops[WRITE] = -1;
+	tg->bps[READ][LIMIT_MAX] = U64_MAX;
+	tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
+	tg->iops[READ][LIMIT_MAX] = UINT_MAX;
+	tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
+	tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
+	tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
+	tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
+	tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+	/* LIMIT_LOW will have default value 0 */
+
+	tg->latency_target = DFL_LATENCY_TARGET;
 	return &tg->pd;
 }
@@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
+
+	tg->idletime_threshold = td->dft_idletime_threshold;
 }
 /*
@@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 static void tg_update_has_rules(struct throtl_grp *tg)
 {
 	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
+	struct throtl_data *td = tg->td;
 	int rw;
 	for (rw = READ; rw <= WRITE; rw++)
 		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
-			(tg->bps[rw] != -1 || tg->iops[rw] != -1);
+			(td->limit_valid[td->limit_index] &&
+			 (tg_bps_limit(tg, rw) != U64_MAX ||
+			  tg_iops_limit(tg, rw) != UINT_MAX));
 }
 static void throtl_pd_online(struct blkg_policy_data *pd)
 {
+	struct throtl_grp *tg = pd_to_tg(pd);
 	/*
 	 * We don't want new groups to escape the limits of its ancestors.
 	 * Update has_rules[] after a new group is brought online.
 	 */
-	tg_update_has_rules(pd_to_tg(pd));
+	tg_update_has_rules(tg);
+}
+
+static void blk_throtl_update_limit_valid(struct throtl_data *td)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+	bool low_valid = false;
+
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
+		if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
+		    tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+			low_valid = true;
+	}
+	rcu_read_unlock();
+
+	td->limit_valid[LIMIT_LOW] = low_valid;
+}
+
+static void throtl_upgrade_state(struct throtl_data *td);
+static void throtl_pd_offline(struct blkg_policy_data *pd)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+
+	tg->bps[READ][LIMIT_LOW] = 0;
+	tg->bps[WRITE][LIMIT_LOW] = 0;
+	tg->iops[READ][LIMIT_LOW] = 0;
+	tg->iops[WRITE][LIMIT_LOW] = 0;
+
+	blk_throtl_update_limit_valid(tg->td);
+
+	if (!tg->td->limit_valid[tg->td->limit_index])
+		throtl_upgrade_state(tg->td);
 }
 static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
 					  unsigned long expires)
 {
+	unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
+
+	/*
+	 * Since we are adjusting the throttle limit dynamically, the sleep
+	 * time calculated according to previous limit might be invalid. It's
+	 * possible the cgroup sleep time is very long and no other cgroups
+	 * have IO running so notify the limit changes. Make sure the cgroup
+	 * doesn't sleep too long to avoid the missed notification.
+	 */
+	if (time_after(expires, max_expire))
+		expires = max_expire;
 	mod_timer(&sq->pending_timer, expires);
 	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
 		   expires - jiffies, jiffies);
@@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
 	if (time_after_eq(start, tg->slice_start[rw]))
 		tg->slice_start[rw] = start;
-	tg->slice_end[rw] = jiffies + throtl_slice;
+	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
 	throtl_log(&tg->service_queue,
 		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 	tg->bytes_disp[rw] = 0;
 	tg->io_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
-	tg->slice_end[rw] = jiffies + throtl_slice;
+	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
 	throtl_log(&tg->service_queue,
 		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
 					unsigned long jiffy_end)
 {
-	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
 }
 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
 				       unsigned long jiffy_end)
 {
-	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
 	throtl_log(&tg->service_queue,
 		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 	 * is bad because it does not allow new slice to start.
 	 */
-	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
+	throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
 	time_elapsed = jiffies - tg->slice_start[rw];
-	nr_slices = time_elapsed / throtl_slice;
+	nr_slices = time_elapsed / tg->td->throtl_slice;
 	if (!nr_slices)
 		return;
-	tmp = tg->bps[rw] * throtl_slice * nr_slices;
+	tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
 	do_div(tmp, HZ);
 	bytes_trim = tmp;
-	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
+	io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
+		HZ;
 	if (!bytes_trim && !io_trim)
 		return;
@@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 	else
 		tg->io_disp[rw] = 0;
-	tg->slice_start[rw] += nr_slices * throtl_slice;
+	tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
 	throtl_log(&tg->service_queue,
 		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
@@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 	/* Slice has just started. Consider one slice interval */
 	if (!jiffy_elapsed)
-		jiffy_elapsed_rnd = throtl_slice;
+		jiffy_elapsed_rnd = tg->td->throtl_slice;
-	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
 	/*
 	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 	 * have been trimmed.
 	 */
-	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+	tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
 	do_div(tmp, HZ);
 	if (tmp > UINT_MAX)
@@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 	}
 	/* Calc approx time to dispatch */
-	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
+	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
 	if (jiffy_wait > jiffy_elapsed)
 		jiffy_wait = jiffy_wait - jiffy_elapsed;
@@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 	/* Slice has just started. Consider one slice interval */
 	if (!jiffy_elapsed)
-		jiffy_elapsed_rnd = throtl_slice;
+		jiffy_elapsed_rnd = tg->td->throtl_slice;
-	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
-	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
+	tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
 	do_div(tmp, HZ);
 	bytes_allowed = tmp;
@@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 	/* Calc approx time to dispatch */
 	extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
-	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
+	jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
 	if (!jiffy_wait)
 		jiffy_wait = 1;
@@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
 	/* If tg->bps = -1, then BW is unlimited */
-	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
+	if (tg_bps_limit(tg, rw) == U64_MAX &&
+	    tg_iops_limit(tg, rw) == UINT_MAX) {
 		if (wait)
 			*wait = 0;
 		return true;
@@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
 		throtl_start_new_slice(tg, rw);
 	else {
-		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
+		if (time_before(tg->slice_end[rw],
+		    jiffies + tg->td->throtl_slice))
+			throtl_extend_slice(tg, rw,
+				jiffies + tg->td->throtl_slice);
 	}
 	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
@@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
 	tg->io_disp[rw]++;
+	tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+	tg->last_io_disp[rw]++;
 	/*
 	 * BIO_THROTTLED is used to prevent the same bio to be throttled
@@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 	return nr_disp;
 }
+static bool throtl_can_upgrade(struct throtl_data *td,
+	struct throtl_grp *this_tg);
 /**
  * throtl_pending_timer_fn - timer function for service_queue->pending_timer
  * @arg: the throtl_service_queue being serviced
@@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg)
 	int ret;
 	spin_lock_irq(q->queue_lock);
+	if (throtl_can_upgrade(td, NULL))
+		throtl_upgrade_state(td);
+
 again:
 	parent_sq = sq->parent_sq;
 	dispatched = false;
@@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
 	struct throtl_grp *tg = pd_to_tg(pd);
 	u64 v = *(u64 *)((void *)tg + off);
-	if (v == -1)
+	if (v == U64_MAX)
 		return 0;
 	return __blkg_prfill_u64(sf, pd, v);
 }
@@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
 	struct throtl_grp *tg = pd_to_tg(pd);
 	unsigned int v = *(unsigned int *)((void *)tg + off);
-	if (v == -1)
+	if (v == UINT_MAX)
 		return 0;
 	return __blkg_prfill_u64(sf, pd, v);
 }
@@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg)
 	throtl_log(&tg->service_queue,
 		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
-		   tg->bps[READ], tg->bps[WRITE],
-		   tg->iops[READ], tg->iops[WRITE]);
+		   tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
+		   tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
 	/*
 	 * Update has_rules[] flags for the updated tg's subtree.  A tg is
@@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	if (sscanf(ctx.body, "%llu", &v) != 1)
 		goto out_finish;
 	if (!v)
-		v = -1;
+		v = U64_MAX;
 	tg = blkg_to_tg(ctx.blkg);
@@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
 static struct cftype throtl_legacy_files[] = {
 	{
 		.name = "throttle.read_bps_device",
-		.private = offsetof(struct throtl_grp, bps[READ]),
+		.private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
 		.seq_show = tg_print_conf_u64,
 		.write = tg_set_conf_u64,
 	},
 	{
 		.name = "throttle.write_bps_device",
-		.private = offsetof(struct throtl_grp, bps[WRITE]),
+		.private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
 		.seq_show = tg_print_conf_u64,
 		.write = tg_set_conf_u64,
 	},
 	{
 		.name = "throttle.read_iops_device",
-		.private = offsetof(struct throtl_grp, iops[READ]),
+		.private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
 		.seq_show = tg_print_conf_uint,
 		.write = tg_set_conf_uint,
 	},
 	{
 		.name = "throttle.write_iops_device",
-		.private = offsetof(struct throtl_grp, iops[WRITE]),
+		.private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
 		.seq_show = tg_print_conf_uint,
 		.write = tg_set_conf_uint,
 	},
@@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = {
 	{ } /* terminate */
 };
-static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
 			 int off)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
 	const char *dname = blkg_dev_name(pd->blkg);
 	char bufs[4][21] = { "max", "max", "max", "max" };
+	u64 bps_dft;
+	unsigned int iops_dft;
+	char idle_time[26] = "";
+	char latency_time[26] = "";
 	if (!dname)
 		return 0;
-	if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
-	    tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+
+	if (off == LIMIT_LOW) {
+		bps_dft = 0;
+		iops_dft = 0;
+	} else {
+		bps_dft = U64_MAX;
+		iops_dft = UINT_MAX;
+	}
+
+	if (tg->bps_conf[READ][off] == bps_dft &&
+	    tg->bps_conf[WRITE][off] == bps_dft &&
+	    tg->iops_conf[READ][off] == iops_dft &&
+	    tg->iops_conf[WRITE][off] == iops_dft &&
+	    (off != LIMIT_LOW ||
+	     (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
+	      tg->latency_target == DFL_LATENCY_TARGET)))
 		return 0;
-	if (tg->bps[READ] != -1)
-		snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
-	if (tg->bps[WRITE] != -1)
-		snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
-	if (tg->iops[READ] != -1)
-		snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
-	if (tg->iops[WRITE] != -1)
-		snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
-
-	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
-		   dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+	if (tg->bps_conf[READ][off] != bps_dft)
+		snprintf(bufs[0], sizeof(bufs[0]), "%llu",
+			tg->bps_conf[READ][off]);
+	if (tg->bps_conf[WRITE][off] != bps_dft)
+		snprintf(bufs[1], sizeof(bufs[1]), "%llu",
+			tg->bps_conf[WRITE][off]);
+	if (tg->iops_conf[READ][off] != iops_dft)
+		snprintf(bufs[2], sizeof(bufs[2]), "%u",
+			tg->iops_conf[READ][off]);
+	if (tg->iops_conf[WRITE][off] != iops_dft)
+		snprintf(bufs[3], sizeof(bufs[3]), "%u",
+			tg->iops_conf[WRITE][off]);
+	if (off == LIMIT_LOW) {
+		if (tg->idletime_threshold == ULONG_MAX)
+			strcpy(idle_time, " idle=max");
+		else
+			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
+				tg->idletime_threshold);
+
+		if (tg->latency_target == ULONG_MAX)
+			strcpy(latency_time, " latency=max");
+		else
+			snprintf(latency_time, sizeof(latency_time),
+				" latency=%lu", tg->latency_target);
+	}
+
+	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
+		   dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
+		   latency_time);
 	return 0;
 }
-static int tg_print_max(struct seq_file *sf, void *v)
+static int tg_print_limit(struct seq_file *sf, void *v)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
 			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
 	return 0;
 }
-static ssize_t tg_set_max(struct kernfs_open_file *of,
+static ssize_t tg_set_limit(struct kernfs_open_file *of,
 			  char *buf, size_t nbytes, loff_t off)
 {
 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	u64 v[4];
+	unsigned long idle_time;
+	unsigned long latency_time;
 	int ret;
+	int index = of_cft(of)->private;
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
 	if (ret)
@@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
 	tg = blkg_to_tg(ctx.blkg);
-	v[0] = tg->bps[READ];
-	v[1] = tg->bps[WRITE];
-	v[2] = tg->iops[READ];
-	v[3] = tg->iops[WRITE];
+	v[0] = tg->bps_conf[READ][index];
+	v[1] = tg->bps_conf[WRITE][index];
+	v[2] = tg->iops_conf[READ][index];
+	v[3] = tg->iops_conf[WRITE][index];
+	idle_time = tg->idletime_threshold;
+	latency_time = tg->latency_target;
 	while (true) {
 		char tok[27]; /* wiops=18446744073709551616 */
 		char *p;
-		u64 val = -1;
+		u64 val = U64_MAX;
 		int len;
 		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
@@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
 			v[2] = min_t(u64, val, UINT_MAX);
 		else if (!strcmp(tok, "wiops"))
 			v[3] = min_t(u64, val, UINT_MAX);
+		else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
+			idle_time = val;
+		else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
+			latency_time = val;
 		else
 			goto out_finish;
 	}
-	tg->bps[READ] = v[0];
-	tg->bps[WRITE] = v[1];
-	tg->iops[READ] = v[2];
-	tg->iops[WRITE] = v[3];
+	tg->bps_conf[READ][index] = v[0];
+	tg->bps_conf[WRITE][index] = v[1];
+	tg->iops_conf[READ][index] = v[2];
+	tg->iops_conf[WRITE][index] = v[3];
+	if (index == LIMIT_MAX) {
+		tg->bps[READ][index] = v[0];
+		tg->bps[WRITE][index] = v[1];
+		tg->iops[READ][index] = v[2];
+		tg->iops[WRITE][index] = v[3];
+	}
+	tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
+		tg->bps_conf[READ][LIMIT_MAX]);
+	tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
+		tg->bps_conf[WRITE][LIMIT_MAX]);
+	tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
+		tg->iops_conf[READ][LIMIT_MAX]);
+	tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
+		tg->iops_conf[WRITE][LIMIT_MAX]);
+
+	if (index == LIMIT_LOW) {
+		blk_throtl_update_limit_valid(tg->td);
+		if (tg->td->limit_valid[LIMIT_LOW])
+			tg->td->limit_index = LIMIT_LOW;
+		tg->idletime_threshold = (idle_time == ULONG_MAX) ?
+			ULONG_MAX : idle_time;
+		tg->latency_target = (latency_time == ULONG_MAX) ?
+			ULONG_MAX : latency_time;
+	}
 	tg_conf_updated(tg);
 	ret = 0;
 out_finish:
@@ -1365,11 +1641,21 @@ out_finish:
 }
 static struct cftype throtl_files[] = {
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+	{
+		.name = "low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = tg_print_limit,
+		.write = tg_set_limit,
+		.private = LIMIT_LOW,
+	},
+#endif
 	{
 		.name = "max",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = tg_print_max,
-		.write = tg_set_max,
+		.seq_show = tg_print_limit,
+		.write = tg_set_limit,
+		.private = LIMIT_MAX,
 	},
 	{ } /* terminate */
 };
@@ -1388,9 +1674,376 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_alloc_fn = throtl_pd_alloc,
 	.pd_init_fn = throtl_pd_init,
 	.pd_online_fn = throtl_pd_online,
+	.pd_offline_fn = throtl_pd_offline,
 	.pd_free_fn = throtl_pd_free,
 };
+static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+	unsigned long rtime = jiffies, wtime = jiffies;
+
+	if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
+		rtime = tg->last_low_overflow_time[READ];
+	if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+		wtime = tg->last_low_overflow_time[WRITE];
+	return min(rtime, wtime);
+}
+
+/* tg should not be an intermediate node */
+static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *parent_sq;
+	struct throtl_grp *parent = tg;
+	unsigned long ret = __tg_last_low_overflow_time(tg);
+
+	while (true) {
+		parent_sq = parent->service_queue.parent_sq;
+		parent = sq_to_tg(parent_sq);
+		if (!parent)
+			break;
+
+		/*
+		 * The parent doesn't have low limit, it always reaches low
+		 * limit. Its overflow time is useless for children
+		 */
+		if (!parent->bps[READ][LIMIT_LOW] &&
+		    !parent->iops[READ][LIMIT_LOW] &&
+		    !parent->bps[WRITE][LIMIT_LOW] &&
+		    !parent->iops[WRITE][LIMIT_LOW])
+			continue;
+		if (time_after(__tg_last_low_overflow_time(parent), ret))
+			ret = __tg_last_low_overflow_time(parent);
+	}
+	return ret;
+}
+
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+	/*
+	 * cgroup is idle if:
+	 * - single idle is too long, longer than a fixed value (in case user
+	 *   configure a too big threshold) or 4 times of slice
+	 * - average think time is more than threshold
+	 * - IO latency is largely below threshold
+	 */
+	unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+
+	time = min_t(unsigned long, MAX_IDLE_TIME, time);
+	return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+	       tg->avg_idletime > tg->idletime_threshold ||
+	       (tg->latency_target && tg->bio_cnt &&
+		tg->bad_bio_cnt * 5 < tg->bio_cnt);
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *sq = &tg->service_queue;
+	bool read_limit, write_limit;
+
+	/*
+	 * if cgroup reaches low limit (if low limit is 0, the cgroup always
+	 * reaches), it's ok to upgrade to next limit
+	 */
+	read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
+	write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
+	if (!read_limit && !write_limit)
+		return true;
+	if (read_limit && sq->nr_queued[READ] &&
+	    (!write_limit || sq->nr_queued[WRITE]))
+		return true;
+	if (write_limit && sq->nr_queued[WRITE] &&
+	    (!read_limit || sq->nr_queued[READ]))
+		return true;
+
+	if (time_after_eq(jiffies,
+	    tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
+	    throtl_tg_is_idle(tg))
+		return true;
+	return false;
+}
+
+static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
+{
+	while (true) {
+		if (throtl_tg_can_upgrade(tg))
+			return true;
+		tg = sq_to_tg(tg->service_queue.parent_sq);
+		if (!tg || !tg_to_blkg(tg)->parent)
+			return false;
+	}
+	return false;
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+	struct throtl_grp *this_tg)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+
+	if (td->limit_index != LIMIT_LOW)
+		return false;
+
+	if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
+		return false;
+
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
+		if (tg == this_tg)
+			continue;
+		if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+			continue;
+		if (!throtl_hierarchy_can_upgrade(tg)) {
+			rcu_read_unlock();
+			return false;
+		}
+	}
+	rcu_read_unlock();
+	return true;
+}
+
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+	unsigned long now = jiffies;
+
+	if (tg->td->limit_index != LIMIT_LOW)
+		return;
+
+	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+		return;
+
+	tg->last_check_time = now;
+
+	if (!time_after_eq(now,
+	    __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
+		return;
+
+	if (throtl_can_upgrade(tg->td, NULL))
+		throtl_upgrade_state(tg->td);
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+
+	td->limit_index = LIMIT_MAX;
+	td->low_upgrade_time = jiffies;
+	td->scale = 0;
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+		struct throtl_service_queue *sq = &tg->service_queue;
+
+		tg->disptime = jiffies - 1;
+		throtl_select_dispatch(sq);
+		throtl_schedule_next_dispatch(sq, false);
+	}
+	rcu_read_unlock();
+	throtl_select_dispatch(&td->service_queue);
+	throtl_schedule_next_dispatch(&td->service_queue, false);
+	queue_work(kthrotld_workqueue, &td->dispatch_work);
+}
+
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+	td->scale /= 2;
+
+	if (td->scale) {
+		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
+		return;
+	}
+
+	td->limit_index = new;
+	td->low_downgrade_time = jiffies;
+}
+
+static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
+{
+	struct throtl_data *td = tg->td;
+	unsigned long now = jiffies;
+
+	/*
+	 * If cgroup is below low limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
+	    time_after_eq(now, tg_last_low_overflow_time(tg) +
+	    td->throtl_slice) &&
+	    (!throtl_tg_is_idle(tg) ||
+	     !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
+		return true;
+	return false;
+}
+
+static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
+{
+	while (true) {
+		if (!throtl_tg_can_downgrade(tg))
+			return false;
+		tg = sq_to_tg(tg->service_queue.parent_sq);
+		if (!tg || !tg_to_blkg(tg)->parent)
+			break;
+	}
+	return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+	uint64_t bps;
+	unsigned int iops;
+	unsigned long elapsed_time;
+	unsigned long now = jiffies;
+
+	if (tg->td->limit_index != LIMIT_MAX ||
+	    !tg->td->limit_valid[LIMIT_LOW])
+		return;
+	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+		return;
+	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+		return;
+
+	elapsed_time = now - tg->last_check_time;
+	tg->last_check_time = now;
+
+	if (time_before(now, tg_last_low_overflow_time(tg) +
+	    tg->td->throtl_slice))
+		return;
+
+	if (tg->bps[READ][LIMIT_LOW]) {
+		bps = tg->last_bytes_disp[READ] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[READ][LIMIT_LOW])
+			tg->last_low_overflow_time[READ] = now;
+	}
+
+	if (tg->bps[WRITE][LIMIT_LOW]) {
+		bps = tg->last_bytes_disp[WRITE] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[WRITE][LIMIT_LOW])
+			tg->last_low_overflow_time[WRITE] = now;
+	}
+
+	if (tg->iops[READ][LIMIT_LOW]) {
+		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+		if (iops >= tg->iops[READ][LIMIT_LOW])
+			tg->last_low_overflow_time[READ] = now;
+	}
+
+	if (tg->iops[WRITE][LIMIT_LOW]) {
+		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+		if (iops >= tg->iops[WRITE][LIMIT_LOW])
+			tg->last_low_overflow_time[WRITE] = now;
+	}
+
+	/*
+	 * If cgroup is below low limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (throtl_hierarchy_can_downgrade(tg))
+		throtl_downgrade_state(tg->td, LIMIT_LOW);
+
+	tg->last_bytes_disp[READ] = 0;
+	tg->last_bytes_disp[WRITE] = 0;
+	tg->last_io_disp[READ] = 0;
+	tg->last_io_disp[WRITE] = 0;
+}
+
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+	unsigned long now = ktime_get_ns() >> 10;
+	unsigned long last_finish_time = tg->last_finish_time;
+
+	if (now <= last_finish_time || last_finish_time == 0 ||
+	    last_finish_time == tg->checked_last_finish_time)
+		return;
+
+	tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
+	tg->checked_last_finish_time = last_finish_time;
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+	int i, cpu;
+	unsigned long last_latency = 0;
+	unsigned long latency;
+
+	if (!blk_queue_nonrot(td->queue))
+		return;
+	if (time_before(jiffies, td->last_calculate_time + HZ))
+		return;
+	td->last_calculate_time = jiffies;
+
+	memset(avg_latency, 0, sizeof(avg_latency));
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+		for_each_possible_cpu(cpu) {
+			struct latency_bucket *bucket;
+
+			/* this isn't race free, but ok in practice */
+			bucket = per_cpu_ptr(td->latency_buckets, cpu);
+			tmp->total_latency += bucket[i].total_latency;
+			tmp->samples += bucket[i].samples;
+			bucket[i].total_latency = 0;
+			bucket[i].samples = 0;
+		}
+
+		if (tmp->samples >= 32) {
+			int samples = tmp->samples;
+
+			latency = tmp->total_latency;
+
+			tmp->total_latency = 0;
+			tmp->samples = 0;
+			latency /= samples;
+			if (latency == 0)
+				continue;
+			avg_latency[i].latency = latency;
+		}
+	}
+
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		if (!avg_latency[i].latency) {
+			if (td->avg_buckets[i].latency < last_latency)
+				td->avg_buckets[i].latency = last_latency;
+			continue;
+		}
+
+		if (!td->avg_buckets[i].valid)
+			latency = avg_latency[i].latency;
+		else
+			latency = (td->avg_buckets[i].latency * 7 +
+				avg_latency[i].latency) >> 3;
+
+		td->avg_buckets[i].latency = max(latency, last_latency);
+		td->avg_buckets[i].valid = true;
+		last_latency = td->avg_buckets[i].latency;
+	}
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
+static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+{
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+	int ret;
+
+	ret = bio_associate_current(bio);
+	if (ret == 0 || ret == -EBUSY)
+		bio->bi_cg_private = tg;
+	blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+	bio_associate_current(bio);
+#endif
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1399,6 +2052,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
 	bool throttled = false;
+	struct throtl_data *td = tg->td;
 	WARN_ON_ONCE(!rcu_read_lock_held());
@@ -1408,19 +2062,35 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	spin_lock_irq(q->queue_lock);
+	throtl_update_latency_buckets(td);
+
 	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
+	blk_throtl_assoc_bio(tg, bio);
+	blk_throtl_update_idletime(tg);
+
 	sq = &tg->service_queue;
+again:
 	while (true) {
+		if (tg->last_low_overflow_time[rw] == 0)
+			tg->last_low_overflow_time[rw] = jiffies;
+		throtl_downgrade_check(tg);
+		throtl_upgrade_check(tg);
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[rw])
 			break;
 		/* if above limits, break to queue */
-		if (!tg_may_dispatch(tg, bio, NULL))
+		if (!tg_may_dispatch(tg, bio, NULL)) {
+			tg->last_low_overflow_time[rw] = jiffies;
+			if (throtl_can_upgrade(td, tg)) {
+				throtl_upgrade_state(td);
+				goto again;
+			}
 			break;
+		}
 		/* within limits, let's charge and dispatch directly */
 		throtl_charge_bio(tg, bio);
@@ -1453,12 +2123,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	/* out-of-limit, queue to @tg */
 	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
 		   rw == READ ? 'R' : 'W',
-		   tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
-		   tg->io_disp[rw], tg->iops[rw],
+		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
+		   tg_bps_limit(tg, rw),
+		   tg->io_disp[rw], tg_iops_limit(tg, rw),
 		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
-	bio_associate_current(bio);
-	tg->td->nr_queued[rw]++;
+	tg->last_low_overflow_time[rw] = jiffies;
+
+	td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
@@ -1483,9 +2155,94 @@ out:
 	 */
 	if (!throttled)
 		bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+	if (throttled || !td->track_bio_latency)
+		bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
 	return throttled;
 }
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+	int op, unsigned long time)
+{
+	struct latency_bucket *latency;
+	int index;
+
+	if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+	    !blk_queue_nonrot(td->queue))
+		return;
+
+	index = request_bucket_index(size);
+
+	latency = get_cpu_ptr(td->latency_buckets);
+	latency[index].total_latency += time;
+	latency[index].samples++;
+	put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+	struct request_queue *q = rq->q;
+	struct throtl_data *td = q->td;
+
+	throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+		req_op(rq), time_ns >> 10);
+}
+
+void blk_throtl_bio_endio(struct bio *bio)
+{
+	struct throtl_grp *tg;
+	u64 finish_time_ns;
+	unsigned long finish_time;
+	unsigned long start_time;
+	unsigned long lat;
+
+	tg = bio->bi_cg_private;
+	if (!tg)
+		return;
+	bio->bi_cg_private = NULL;
+
+	finish_time_ns = ktime_get_ns();
+	tg->last_finish_time = finish_time_ns >> 10;
+
+	start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+	finish_time = __blk_stat_time(finish_time_ns) >> 10;
+	if (!start_time || finish_time <= start_time)
+		return;
+
+	lat = finish_time - start_time;
+	/* this is only for bio based driver */
+	if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
+		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+			bio_op(bio), lat);
+
+	if (tg->latency_target) {
+		int bucket;
+		unsigned int threshold;
+
+		bucket = request_bucket_index(
+			blk_stat_size(&bio->bi_issue_stat));
+		threshold = tg->td->avg_buckets[bucket].latency +
+			tg->latency_target;
+		if (lat > threshold)
+			tg->bad_bio_cnt++;
+		/*
+		 * Not race free, could get wrong count, which means cgroups
+		 * will be throttled
+		 */
+		tg->bio_cnt++;
+	}
+
+	if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
+		tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
+		tg->bio_cnt /= 2;
+		tg->bad_bio_cnt /= 2;
+	}
+}
+#endif
+
 /*
  * Dispatch all bios from all children tg's queued on @parent_sq.  On
  * return, @parent_sq is guaranteed to not have any active children tg's
@@ -1558,6 +2315,12 @@ int blk_throtl_init(struct request_queue *q)
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
 		return -ENOMEM;
+	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+		LATENCY_BUCKET_SIZE, __alignof__(u64));
+	if (!td->latency_buckets) {
+		kfree(td);
+		return -ENOMEM;
+	}
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
@@ -1565,10 +2328,17 @@ int blk_throtl_init(struct request_queue *q)
 	q->td = td;
 	td->queue = q;
+	td->limit_valid[LIMIT_MAX] = true;
+	td->limit_index = LIMIT_MAX;
+	td->low_upgrade_time = jiffies;
+	td->low_downgrade_time = jiffies;
+
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
-	if (ret)
+	if (ret) {
+		free_percpu(td->latency_buckets);
 		kfree(td);
+	}
 	return ret;
 }
@@ -1577,9 +2347,74 @@ void blk_throtl_exit(struct request_queue *q)
 {
 	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
 	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+	free_percpu(q->td->latency_buckets);
 	kfree(q->td);
 }
+void blk_throtl_register_queue(struct request_queue *q)
+{
+	struct throtl_data *td;
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+
+	td = q->td;
+	BUG_ON(!td);
+
+	if (blk_queue_nonrot(q)) {
+		td->throtl_slice = DFL_THROTL_SLICE_SSD;
+		td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+	} else {
+		td->throtl_slice = DFL_THROTL_SLICE_HD;
+		td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
+	}
+#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
+	/* if no low limit, use previous default */
+	td->throtl_slice = DFL_THROTL_SLICE_HD;
+#endif
+
+	td->track_bio_latency = !q->mq_ops && !q->request_fn;
+	if (!td->track_bio_latency)
+		blk_stat_enable_accounting(q);
+
+	/*
+	 * some tg are created before queue is fully initialized, eg, nonrot
+	 * isn't initialized yet
+	 */
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
+		tg->idletime_threshold = td->dft_idletime_threshold;
+	}
+	rcu_read_unlock();
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
+{
+	if (!q->td)
+		return -EINVAL;
+	return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
+}
+
+ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+	const char *page, size_t count)
+{
+	unsigned long v;
+	unsigned long t;
+
+	if (!q->td)
+		return -EINVAL;
+	if (kstrtoul(page, 10, &v))
+		return -EINVAL;
+	t = msecs_to_jiffies(v);
+	if (t == 0 || t > MAX_THROTL_SLICE)
+		return -EINVAL;
+	q->td->throtl_slice = t;
+	return count;
+}
+#endif
+
 static int __init throtl_init(void)
 {
 	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
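
A minimal user-space sketch, not part of the patch above, that mirrors the arithmetic of throtl_adjusted_limit() and tg_bps_limit(): after an upgrade to LIMIT_MAX, the effective limit starts at the cgroup's .low value and grows by half of .low for every elapsed throtl_slice, capped at the configured .max (throtl_downgrade_state() halves the scale again if the cgroup keeps missing its .low limit). The 10 MB/s and 100 MB/s figures below are made-up example values, not defaults from the patch.

#include <stdio.h>
#include <stdint.h>

/* Same formula as throtl_adjusted_limit(): low + (low / 2) * scale */
static uint64_t adjusted_limit(uint64_t low, unsigned int scale)
{
	return low + (low >> 1) * scale;
}

int main(void)
{
	const uint64_t low = 10ULL * 1024 * 1024;	/* example .low: 10 MB/s */
	const uint64_t max = 100ULL * 1024 * 1024;	/* example .max: 100 MB/s */
	unsigned int scale;

	/* scale grows by one per elapsed throtl_slice after an upgrade */
	for (scale = 0; scale <= 20; scale++) {
		uint64_t eff = adjusted_limit(low, scale);

		if (eff > max)	/* tg_bps_limit() caps at the .max limit */
			eff = max;
		printf("slices since upgrade %2u: effective bps %llu\n",
		       scale, (unsigned long long)eff);
	}
	return 0;
}

With these example numbers the effective limit reaches .max after 18 slices, which is the gradual ramp the "Scale up" comment in the patch describes.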