diff options
-rw-r--r-- | block/Kconfig.iosched | 4 | ||||
-rw-r--r-- | block/blk-cgroup.c | 1506 | ||||
-rw-r--r-- | block/blk-cgroup.h | 324 | ||||
-rw-r--r-- | block/blk-core.c | 180 | ||||
-rw-r--r-- | block/blk-ioc.c | 126 | ||||
-rw-r--r-- | block/blk-sysfs.c | 6 | ||||
-rw-r--r-- | block/blk-throttle.c | 450 | ||||
-rw-r--r-- | block/blk.h | 32 | ||||
-rw-r--r-- | block/cfq-iosched.c | 608 | ||||
-rw-r--r-- | block/cfq.h | 113 | ||||
-rw-r--r-- | block/deadline-iosched.c | 8 | ||||
-rw-r--r-- | block/elevator.c | 123 | ||||
-rw-r--r-- | block/noop-iosched.c | 8 | ||||
-rw-r--r-- | fs/bio.c | 61 | ||||
-rw-r--r-- | fs/ioprio.c | 2 | ||||
-rw-r--r-- | include/linux/bio.h | 8 | ||||
-rw-r--r-- | include/linux/blk_types.h | 10 | ||||
-rw-r--r-- | include/linux/blkdev.h | 12 | ||||
-rw-r--r-- | include/linux/elevator.h | 8 | ||||
-rw-r--r-- | include/linux/iocontext.h | 39 | ||||
-rw-r--r-- | include/linux/ioprio.h | 22 | ||||
-rw-r--r-- | init/Kconfig | 2 | ||||
-rw-r--r-- | kernel/fork.c | 5 |
23 files changed, 1745 insertions, 1912 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 3199b76f795d..421bef9c4c48 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,8 +23,6 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - # If BLK_CGROUP is a module, CFQ has to be built as module. - depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -34,8 +32,6 @@ config IOSCHED_CFQ This is the default I/O scheduler. - Note: If BLK_CGROUP=m, then CFQ can be built only as module. - config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && BLK_CGROUP diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 126c341955de..4fdeb46b4436 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -17,71 +17,38 @@ #include <linux/err.h> #include <linux/blkdev.h> #include <linux/slab.h> -#include "blk-cgroup.h" #include <linux/genhd.h> +#include <linux/delay.h> +#include <linux/atomic.h> +#include "blk-cgroup.h" +#include "blk.h" #define MAX_KEY_LEN 100 static DEFINE_SPINLOCK(blkio_list_lock); static LIST_HEAD(blkio_list); +static DEFINE_MUTEX(all_q_mutex); +static LIST_HEAD(all_q_list); + +/* List of groups pending per cpu stats allocation */ +static DEFINE_SPINLOCK(alloc_list_lock); +static LIST_HEAD(alloc_list); + +static void blkio_stat_alloc_fn(struct work_struct *); +static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn); + struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); +static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES]; + /* for encoding cft->private value on file */ #define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) /* What policy owns the file, proportional or throttle */ #define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) #define BLKIOFILE_ATTR(val) ((val) & 0xffff) -static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, - struct blkio_policy_node *pn) -{ - list_add(&pn->node, &blkcg->policy_list); -} - -static inline bool cftype_blkg_same_policy(struct cftype *cft, - struct blkio_group *blkg) -{ - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - - if (blkg->plid == plid) - return 1; - - return 0; -} - -/* Determines if policy node matches cgroup file being accessed */ -static inline bool pn_matches_cftype(struct cftype *cft, - struct blkio_policy_node *pn) -{ - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int fileid = BLKIOFILE_ATTR(cft->private); - - return (plid == pn->plid && fileid == pn->fileid); -} - -/* Must be called with blkcg->lock held */ -static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) -{ - list_del(&pn->node); -} - -/* Must be called with blkcg->lock held */ -static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, - enum blkio_policy_id plid, int fileid) -{ - struct blkio_policy_node *pn; - - list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) - return pn; - } - - return NULL; -} - struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), @@ -89,77 +56,85 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) +static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) { return container_of(task_subsys_state(tsk, blkio_subsys_id), struct blkio_cgroup, css); } -EXPORT_SYMBOL_GPL(task_blkio_cgroup); -static inline void -blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio) +{ + if (bio && bio->bi_css) + return container_of(bio->bi_css, struct blkio_cgroup, css); + return task_blkio_cgroup(current); +} +EXPORT_SYMBOL_GPL(bio_blkio_cgroup); + +static inline void blkio_update_group_weight(struct blkio_group *blkg, + int plid, unsigned int weight) { struct blkio_policy_type *blkiop; list_for_each_entry(blkiop, &blkio_list, list) { /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (blkiop->plid != plid) continue; if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkiop->ops.blkio_update_group_weight_fn(blkg->q, blkg, weight); } } -static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, - int fileid) +static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid, + u64 bps, int fileid) { struct blkio_policy_type *blkiop; list_for_each_entry(blkiop, &blkio_list, list) { /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (blkiop->plid != plid) continue; if (fileid == BLKIO_THROTL_read_bps_device && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkiop->ops.blkio_update_group_read_bps_fn(blkg->q, blkg, bps); if (fileid == BLKIO_THROTL_write_bps_device && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkiop->ops.blkio_update_group_write_bps_fn(blkg->q, blkg, bps); } } static inline void blkio_update_group_iops(struct blkio_group *blkg, - unsigned int iops, int fileid) + int plid, unsigned int iops, + int fileid) { struct blkio_policy_type *blkiop; list_for_each_entry(blkiop, &blkio_list, list) { /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (blkiop->plid != plid) continue; if (fileid == BLKIO_THROTL_read_iops_device && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkiop->ops.blkio_update_group_read_iops_fn(blkg->q, blkg, iops); if (fileid == BLKIO_THROTL_write_iops_device && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkiop->ops.blkio_update_group_write_iops_fn(blkg->q, blkg,iops); } } /* * Add to the appropriate stat variable depending on the request type. - * This should be called with the blkg->stats_lock held. + * This should be called with queue_lock held. */ static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, bool sync) @@ -177,7 +152,7 @@ static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, /* * Decrements the appropriate stat variable if non-zero depending on the * request type. Panics on value being zero. - * This should be called with the blkg->stats_lock held. + * This should be called with the queue_lock held. */ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) { @@ -198,19 +173,22 @@ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) } #ifdef CONFIG_DEBUG_BLK_CGROUP -/* This should be called with the blkg->stats_lock held. */ +/* This should be called with the queue_lock held. */ static void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg) { - if (blkio_blkg_waiting(&blkg->stats)) + struct blkg_policy_data *pd = blkg->pd[pol->plid]; + + if (blkio_blkg_waiting(&pd->stats)) return; if (blkg == curr_blkg) return; - blkg->stats.start_group_wait_time = sched_clock(); - blkio_mark_blkg_waiting(&blkg->stats); + pd->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&pd->stats); } -/* This should be called with the blkg->stats_lock held. */ +/* This should be called with the queue_lock held. */ static void blkio_update_group_wait_time(struct blkio_group_stats *stats) { unsigned long long now; @@ -224,7 +202,7 @@ static void blkio_update_group_wait_time(struct blkio_group_stats *stats) blkio_clear_blkg_waiting(stats); } -/* This should be called with the blkg->stats_lock held. */ +/* This should be called with the queue_lock held. */ static void blkio_end_empty_time(struct blkio_group_stats *stats) { unsigned long long now; @@ -238,132 +216,146 @@ static void blkio_end_empty_time(struct blkio_group_stats *stats) blkio_clear_blkg_empty(stats); } -void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; + + lockdep_assert_held(blkg->q->queue_lock); + BUG_ON(blkio_blkg_idling(stats)); - spin_lock_irqsave(&blkg->stats_lock, flags); - BUG_ON(blkio_blkg_idling(&blkg->stats)); - blkg->stats.start_idle_time = sched_clock(); - blkio_mark_blkg_idling(&blkg->stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + stats->start_idle_time = sched_clock(); + blkio_mark_blkg_idling(stats); } EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); -void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +void blkiocg_update_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - unsigned long flags; - unsigned long long now; - struct blkio_group_stats *stats; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; + + lockdep_assert_held(blkg->q->queue_lock); - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; if (blkio_blkg_idling(stats)) { - now = sched_clock(); - if (time_after64(now, stats->start_idle_time)) + unsigned long long now = sched_clock(); + + if (time_after64(now, stats->start_idle_time)) { + u64_stats_update_begin(&stats->syncp); stats->idle_time += now - stats->start_idle_time; + u64_stats_update_end(&stats->syncp); + } blkio_clear_blkg_idling(stats); } - spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); -void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - unsigned long flags; - struct blkio_group_stats *stats; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; + + lockdep_assert_held(blkg->q->queue_lock); - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + u64_stats_update_begin(&stats->syncp); stats->avg_queue_size_sum += stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; stats->avg_queue_size_samples++; blkio_update_group_wait_time(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); -void blkiocg_set_start_empty_time(struct blkio_group *blkg) +void blkiocg_set_start_empty_time(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - unsigned long flags; - struct blkio_group_stats *stats; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + lockdep_assert_held(blkg->q->queue_lock); if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) return; - } /* * group is already marked empty. This can happen if cfqq got new * request in parent group and moved to this group while being added * to service tree. Just ignore the event and move on. */ - if(blkio_blkg_empty(stats)) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); + if (blkio_blkg_empty(stats)) return; - } stats->start_empty_time = sched_clock(); blkio_mark_blkg_empty(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) + struct blkio_policy_type *pol, + unsigned long dequeue) { - blkg->stats.dequeue += dequeue; + struct blkg_policy_data *pd = blkg->pd[pol->plid]; + + lockdep_assert_held(blkg->q->queue_lock); + + pd->stats.dequeue += dequeue; } EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); #else static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) {} -static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg) { } +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { } #endif void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, - bool sync) + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg, bool direction, + bool sync) { - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; + + lockdep_assert_held(blkg->q->queue_lock); - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, - sync); - blkio_end_empty_time(&blkg->stats); - blkio_set_start_group_wait_time(blkg, curr_blkg); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + u64_stats_update_begin(&stats->syncp); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync); + blkio_end_empty_time(stats); + u64_stats_update_end(&stats->syncp); + + blkio_set_start_group_wait_time(blkg, pol, curr_blkg); } EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) + struct blkio_policy_type *pol, + bool direction, bool sync) { - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + lockdep_assert_held(blkg->q->queue_lock); + + u64_stats_update_begin(&stats->syncp); + blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction, + sync); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, - unsigned long unaccounted_time) +void blkiocg_update_timeslice_used(struct blkio_group *blkg, + struct blkio_policy_type *pol, + unsigned long time, + unsigned long unaccounted_time) { - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkg->stats.time += time; + lockdep_assert_held(blkg->q->queue_lock); + + u64_stats_update_begin(&stats->syncp); + stats->time += time; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg->stats.unaccounted_time += unaccounted_time; + stats->unaccounted_time += unaccounted_time; #endif - spin_unlock_irqrestore(&blkg->stats_lock, flags); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); @@ -372,11 +364,17 @@ EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); * is valid. */ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) + struct blkio_policy_type *pol, + uint64_t bytes, bool direction, bool sync) { + struct blkg_policy_data *pd = blkg->pd[pol->plid]; struct blkio_group_stats_cpu *stats_cpu; unsigned long flags; + /* If per cpu stats are not allocated yet, don't do any accounting. */ + if (pd->stats_cpu == NULL) + return; + /* * Disabling interrupts to provide mutual exclusion between two * writes on same cpu. It probably is not needed for 64bit. Not @@ -384,7 +382,7 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, */ local_irq_save(flags); - stats_cpu = this_cpu_ptr(blkg->stats_cpu); + stats_cpu = this_cpu_ptr(pd->stats_cpu); u64_stats_update_begin(&stats_cpu->syncp); stats_cpu->sectors += bytes >> 9; @@ -398,213 +396,419 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) + struct blkio_policy_type *pol, + uint64_t start_time, + uint64_t io_start_time, bool direction, + bool sync) { - struct blkio_group_stats *stats; - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; unsigned long long now = sched_clock(); - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + lockdep_assert_held(blkg->q->queue_lock); + + u64_stats_update_begin(&stats->syncp); if (time_after64(now, io_start_time)) blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], now - io_start_time, direction, sync); if (time_after64(io_start_time, start_time)) blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], io_start_time - start_time, direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); /* Merged stats are per cpu. */ -void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, - bool sync) +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol, + bool direction, bool sync) { - struct blkio_group_stats_cpu *stats_cpu; - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. - */ - local_irq_save(flags); + lockdep_assert_held(blkg->q->queue_lock); - stats_cpu = this_cpu_ptr(blkg->stats_cpu); - - u64_stats_update_begin(&stats_cpu->syncp); - blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, - direction, sync); - u64_stats_update_end(&stats_cpu->syncp); - local_irq_restore(flags); + u64_stats_update_begin(&stats->syncp); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); /* - * This function allocates the per cpu stats for blkio_group. Should be called - * from sleepable context as alloc_per_cpu() requires that. + * Worker for allocating per cpu stat for blk groups. This is scheduled on + * the system_nrt_wq once there are some groups on the alloc_list waiting + * for allocation. */ -int blkio_alloc_blkg_stats(struct blkio_group *blkg) +static void blkio_stat_alloc_fn(struct work_struct *work) { - /* Allocate memory for per cpu stats */ - blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); - if (!blkg->stats_cpu) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); + static void *pcpu_stats[BLKIO_NR_POLICIES]; + struct delayed_work *dwork = to_delayed_work(work); + struct blkio_group *blkg; + int i; + bool empty = false; -void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid) -{ - unsigned long flags; +alloc_stats: + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + if (pcpu_stats[i] != NULL) + continue; - spin_lock_irqsave(&blkcg->lock, flags); - spin_lock_init(&blkg->stats_lock); - rcu_assign_pointer(blkg->key, key); - blkg->blkcg_id = css_id(&blkcg->css); - hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - blkg->plid = plid; - spin_unlock_irqrestore(&blkcg->lock, flags); - /* Need to take css reference ? */ - cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); - blkg->dev = dev; + pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu); + + /* Allocation failed. Try again after some time. */ + if (pcpu_stats[i] == NULL) { + queue_delayed_work(system_nrt_wq, dwork, + msecs_to_jiffies(10)); + return; + } + } + + spin_lock_irq(&blkio_list_lock); + spin_lock(&alloc_list_lock); + + /* cgroup got deleted or queue exited. */ + if (!list_empty(&alloc_list)) { + blkg = list_first_entry(&alloc_list, struct blkio_group, + alloc_node); + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + struct blkg_policy_data *pd = blkg->pd[i]; + + if (blkio_policy[i] && pd && !pd->stats_cpu) + swap(pd->stats_cpu, pcpu_stats[i]); + } + + list_del_init(&blkg->alloc_node); + } + + empty = list_empty(&alloc_list); + + spin_unlock(&alloc_list_lock); + spin_unlock_irq(&blkio_list_lock); + + if (!empty) + goto alloc_stats; } -EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); -static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +/** + * blkg_free - free a blkg + * @blkg: blkg to free + * + * Free @blkg which may be partially allocated. + */ +static void blkg_free(struct blkio_group *blkg) { - hlist_del_init_rcu(&blkg->blkcg_node); - blkg->blkcg_id = 0; + int i; + + if (!blkg) + return; + + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + struct blkg_policy_data *pd = blkg->pd[i]; + + if (pd) { + free_percpu(pd->stats_cpu); + kfree(pd); + } + } + + kfree(blkg); } -/* - * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 - * indicating that blk_group was unhashed by the time we got to it. +/** + * blkg_alloc - allocate a blkg + * @blkcg: block cgroup the new blkg is associated with + * @q: request_queue the new blkg is associated with + * + * Allocate a new blkg assocating @blkcg and @q. */ -int blkiocg_del_blkio_group(struct blkio_group *blkg) +static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg, + struct request_queue *q) { - struct blkio_cgroup *blkcg; - unsigned long flags; - struct cgroup_subsys_state *css; - int ret = 1; + struct blkio_group *blkg; + int i; - rcu_read_lock(); - css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (css) { - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; + /* alloc and init base part */ + blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); + if (!blkg) + return NULL; + + blkg->q = q; + INIT_LIST_HEAD(&blkg->q_node); + INIT_LIST_HEAD(&blkg->alloc_node); + blkg->blkcg = blkcg; + blkg->refcnt = 1; + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); + + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + struct blkio_policy_type *pol = blkio_policy[i]; + struct blkg_policy_data *pd; + + if (!pol) + continue; + + /* alloc per-policy data and attach it to blkg */ + pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC, + q->node); + if (!pd) { + blkg_free(blkg); + return NULL; } - spin_unlock_irqrestore(&blkcg->lock, flags); + + blkg->pd[i] = pd; + pd->blkg = blkg; } - rcu_read_unlock(); - return ret; + /* invoke per-policy init */ + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + struct blkio_policy_type *pol = blkio_policy[i]; + + if (pol) + pol->ops.blkio_init_group_fn(blkg); + } + + return blkg; +} + +struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg, + struct request_queue *q, + enum blkio_policy_id plid, + bool for_root) + __releases(q->queue_lock) __acquires(q->queue_lock) +{ + struct blkio_group *blkg; + + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + * The following can be removed if blkg lookup is guaranteed to + * fail on a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q)) && !for_root) + return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); + + blkg = blkg_lookup(blkcg, q); + if (blkg) + return blkg; + + /* blkg holds a reference to blkcg */ + if (!css_tryget(&blkcg->css)) + return ERR_PTR(-EINVAL); + + /* + * Allocate and initialize. + */ + blkg = blkg_alloc(blkcg, q); + + /* did alloc fail? */ + if (unlikely(!blkg)) { + blkg = ERR_PTR(-ENOMEM); + goto out; + } + + /* insert */ + spin_lock(&blkcg->lock); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + list_add(&blkg->q_node, &q->blkg_list); + spin_unlock(&blkcg->lock); + + spin_lock(&alloc_list_lock); + list_add(&blkg->alloc_node, &alloc_list); + /* Queue per cpu stat allocation from worker thread. */ + queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0); + spin_unlock(&alloc_list_lock); +out: + return blkg; } -EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); +EXPORT_SYMBOL_GPL(blkg_lookup_create); /* called under rcu_read_lock(). */ -struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg, + struct request_queue *q) { struct blkio_group *blkg; struct hlist_node *n; - void *__key; - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { - __key = blkg->key; - if (__key == key) + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) + if (blkg->q == q) return blkg; + return NULL; +} +EXPORT_SYMBOL_GPL(blkg_lookup); + +static void blkg_destroy(struct blkio_group *blkg) +{ + struct request_queue *q = blkg->q; + struct blkio_cgroup *blkcg = blkg->blkcg; + + lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&blkcg->lock); + + /* Something wrong if we are trying to remove same group twice */ + WARN_ON_ONCE(list_empty(&blkg->q_node)); + WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + list_del_init(&blkg->q_node); + hlist_del_init_rcu(&blkg->blkcg_node); + + spin_lock(&alloc_list_lock); + list_del_init(&blkg->alloc_node); + spin_unlock(&alloc_list_lock); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + blkg_put(blkg); +} + +/* + * XXX: This updates blkg policy data in-place for root blkg, which is + * necessary across elevator switch and policy registration as root blkgs + * aren't shot down. This broken and racy implementation is temporary. + * Eventually, blkg shoot down will be replaced by proper in-place update. + */ +void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid) +{ + struct blkio_policy_type *pol = blkio_policy[plid]; + struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q); + struct blkg_policy_data *pd; + + if (!blkg) + return; + + kfree(blkg->pd[plid]); + blkg->pd[plid] = NULL; + + if (!pol) + return; + + pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL); + WARN_ON_ONCE(!pd); + + pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); + WARN_ON_ONCE(!pd->stats_cpu); + + blkg->pd[plid] = pd; + pd->blkg = blkg; + pol->ops.blkio_init_group_fn(blkg); +} +EXPORT_SYMBOL_GPL(update_root_blkg_pd); + +/** + * blkg_destroy_all - destroy all blkgs associated with a request_queue + * @q: request_queue of interest + * @destroy_root: whether to destroy root blkg or not + * + * Destroy blkgs associated with @q. If @destroy_root is %true, all are + * destroyed; otherwise, root blkg is left alone. + */ +void blkg_destroy_all(struct request_queue *q, bool destroy_root) +{ + struct blkio_group *blkg, *n; + + spin_lock_irq(q->queue_lock); + + list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + struct blkio_cgroup *blkcg = blkg->blkcg; + + /* skip root? */ + if (!destroy_root && blkg->blkcg == &blkio_root_cgroup) + continue; + + spin_lock(&blkcg->lock); + blkg_destroy(blkg); + spin_unlock(&blkcg->lock); } - return NULL; + spin_unlock_irq(q->queue_lock); } -EXPORT_SYMBOL_GPL(blkiocg_lookup_group); +EXPORT_SYMBOL_GPL(blkg_destroy_all); -static void blkio_reset_stats_cpu(struct blkio_group *blkg) +static void blkg_rcu_free(struct rcu_head *rcu_head) { - struct blkio_group_stats_cpu *stats_cpu; - int i, j, k; + blkg_free(container_of(rcu_head, struct blkio_group, rcu_head)); +} + +void __blkg_release(struct blkio_group *blkg) +{ + /* release the extra blkcg reference this blkg has been holding */ + css_put(&blkg->blkcg->css); + /* - * Note: On 64 bit arch this should not be an issue. This has the - * possibility of returning some inconsistent value on 32bit arch - * as 64bit update on 32bit is non atomic. Taking care of this - * corner case makes code very complicated, like sending IPIs to - * cpus, taking care of stats of offline cpus etc. + * A group is freed in rcu manner. But having an rcu lock does not + * mean that one can access all the fields of blkg and assume these + * are valid. For example, don't try to follow throtl_data and + * request queue links. * - * reset stats is anyway more of a debug feature and this sounds a - * corner case. So I am not complicating the code yet until and - * unless this becomes a real issue. + * Having a reference to blkg under an rcu allows acess to only + * values local to groups like group stats and group rate limits */ - for_each_possible_cpu(i) { - stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); - stats_cpu->sectors = 0; - for(j = 0; j < BLKIO_STAT_CPU_NR; j++) - for (k = 0; k < BLKIO_STAT_TOTAL; k++) - stats_cpu->stat_arr_cpu[j][k] = 0; + call_rcu(&blkg->rcu_head, blkg_rcu_free); +} +EXPORT_SYMBOL_GPL(__blkg_release); + +static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid) +{ + struct blkg_policy_data *pd = blkg->pd[plid]; + int cpu; + + if (pd->stats_cpu == NULL) + return; + + for_each_possible_cpu(cpu) { + struct blkio_group_stats_cpu *sc = + per_cpu_ptr(pd->stats_cpu, cpu); + + sc->sectors = 0; + memset(sc->stat_arr_cpu, 0, sizeof(sc->stat_arr_cpu)); } } static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { - struct blkio_cgroup *blkcg; + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); struct blkio_group *blkg; - struct blkio_group_stats *stats; struct hlist_node *n; - uint64_t queued[BLKIO_STAT_TOTAL]; int i; -#ifdef CONFIG_DEBUG_BLK_CGROUP - bool idling, waiting, empty; - unsigned long long now = sched_clock(); -#endif - blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); + + /* + * Note that stat reset is racy - it doesn't synchronize against + * stat updates. This is a debug feature which shouldn't exist + * anyway. If you get hit by a race, retry. + */ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - spin_lock(&blkg->stats_lock); - stats = &blkg->stats; + struct blkio_policy_type *pol; + + list_for_each_entry(pol, &blkio_list, list) { + struct blkg_policy_data *pd = blkg->pd[pol->plid]; + struct blkio_group_stats *stats = &pd->stats; + + /* queued stats shouldn't be cleared */ + for (i = 0; i < ARRAY_SIZE(stats->stat_arr); i++) + if (i != BLKIO_STAT_QUEUED) + memset(stats->stat_arr[i], 0, + sizeof(stats->stat_arr[i])); + stats->time = 0; #ifdef CONFIG_DEBUG_BLK_CGROUP - idling = blkio_blkg_idling(stats); - waiting = blkio_blkg_waiting(stats); - empty = blkio_blkg_empty(stats); + memset((void *)stats + BLKG_STATS_DEBUG_CLEAR_START, 0, + BLKG_STATS_DEBUG_CLEAR_SIZE); #endif - for (i = 0; i < BLKIO_STAT_TOTAL; i++) - queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; - memset(stats, 0, sizeof(struct blkio_group_stats)); - for (i = 0; i < BLKIO_STAT_TOTAL; i++) - stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; -#ifdef CONFIG_DEBUG_BLK_CGROUP - if (idling) { - blkio_mark_blkg_idling(stats); - stats->start_idle_time = now; - } - if (waiting) { - blkio_mark_blkg_waiting(stats); - stats->start_group_wait_time = now; - } - if (empty) { - blkio_mark_blkg_empty(stats); - stats->start_empty_time = now; + blkio_reset_stats_cpu(blkg, pol->plid); } -#endif - spin_unlock(&blkg->stats_lock); - - /* Reset Per cpu stats which don't take blkg->stats_lock */ - blkio_reset_stats_cpu(blkg); } spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); return 0; } -static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, - int chars_left, bool diskname_only) +static void blkio_get_key_name(enum stat_sub_type type, const char *dname, + char *str, int chars_left, bool diskname_only) { - snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); + snprintf(str, chars_left, "%s", dname); chars_left -= strlen(str); if (chars_left <= 0) { printk(KERN_WARNING @@ -634,25 +838,20 @@ static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, } } -static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, - struct cgroup_map_cb *cb, dev_t dev) -{ - blkio_get_key_name(0, dev, str, chars_left, true); - cb->fill(cb, str, val); - return val; -} - - -static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, +static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid, enum stat_type_cpu type, enum stat_sub_type sub_type) { + struct blkg_policy_data *pd = blkg->pd[plid]; int cpu; struct blkio_group_stats_cpu *stats_cpu; u64 val = 0, tval; + if (pd->stats_cpu == NULL) + return val; + for_each_possible_cpu(cpu) { unsigned int start; - stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); + stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu); do { start = u64_stats_fetch_begin(&stats_cpu->syncp); @@ -668,87 +867,115 @@ static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, return val; } -static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, - struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) +static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid, + struct cgroup_map_cb *cb, const char *dname, + enum stat_type_cpu type) { uint64_t disk_total, val; char key_str[MAX_KEY_LEN]; enum stat_sub_type sub_type; if (type == BLKIO_STAT_CPU_SECTORS) { - val = blkio_read_stat_cpu(blkg, type, 0); - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); + val = blkio_read_stat_cpu(blkg, plid, type, 0); + blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true); + cb->fill(cb, key_str, val); + return val; } for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; sub_type++) { - blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); - val = blkio_read_stat_cpu(blkg, type, sub_type); + blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN, + false); + val = blkio_read_stat_cpu(blkg, plid, type, sub_type); cb->fill(cb, key_str, val); } - disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + - blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); + disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) + + blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE); - blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN, + false); cb->fill(cb, key_str, disk_total); return disk_total; } -/* This should be called with blkg->stats_lock held */ -static uint64_t blkio_get_stat(struct blkio_group *blkg, - struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) +static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid, + struct cgroup_map_cb *cb, const char *dname, + enum stat_type type) { - uint64_t disk_total; + struct blkio_group_stats *stats = &blkg->pd[plid]->stats; + uint64_t v = 0, disk_total = 0; char key_str[MAX_KEY_LEN]; - enum stat_sub_type sub_type; + unsigned int sync_start; + int st; - if (type == BLKIO_STAT_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.time, cb, dev); + if (type >= BLKIO_STAT_ARR_NR) { + do { + sync_start = u64_stats_fetch_begin(&stats->syncp); + switch (type) { + case BLKIO_STAT_TIME: + v = stats->time; + break; #ifdef CONFIG_DEBUG_BLK_CGROUP - if (type == BLKIO_STAT_UNACCOUNTED_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.unaccounted_time, cb, dev); - if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { - uint64_t sum = blkg->stats.avg_queue_size_sum; - uint64_t samples = blkg->stats.avg_queue_size_samples; - if (samples) - do_div(sum, samples); - else - sum = 0; - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); - } - if (type == BLKIO_STAT_GROUP_WAIT_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.group_wait_time, cb, dev); - if (type == BLKIO_STAT_IDLE_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.idle_time, cb, dev); - if (type == BLKIO_STAT_EMPTY_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.empty_time, cb, dev); - if (type == BLKIO_STAT_DEQUEUE) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.dequeue, cb, dev); + case BLKIO_STAT_UNACCOUNTED_TIME: + v = stats->unaccounted_time; + break; + case BLKIO_STAT_AVG_QUEUE_SIZE: { + uint64_t samples = stats->avg_queue_size_samples; + + if (samples) { + v = stats->avg_queue_size_sum; + do_div(v, samples); + } + break; + } + case BLKIO_STAT_IDLE_TIME: + v = stats->idle_time; + break; + case BLKIO_STAT_EMPTY_TIME: + v = stats->empty_time; + break; + case BLKIO_STAT_DEQUEUE: + v = stats->dequeue; + break; + case BLKIO_STAT_GROUP_WAIT_TIME: + v = stats->group_wait_time; + break; #endif + default: + WARN_ON_ONCE(1); + } + } while (u64_stats_fetch_retry(&stats->syncp, sync_start)); - for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; - sub_type++) { - blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); - cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); + blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true); + cb->fill(cb, key_str, v); + return v; + } + + for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) { + do { + sync_start = u64_stats_fetch_begin(&stats->syncp); + v = stats->stat_arr[type][st]; + } while (u64_stats_fetch_retry(&stats->syncp, sync_start)); + + blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, v); + if (st == BLKIO_STAT_READ || st == BLKIO_STAT_WRITE) + disk_total += v; } - disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + - blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; - blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + + blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN, + false); cb->fill(cb, key_str, disk_total); return disk_total; } -static int blkio_policy_parse_and_set(char *buf, - struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) +static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid, + int fileid, struct blkio_cgroup *blkcg) { struct gendisk *disk = NULL; + struct blkio_group *blkg = NULL; + struct blkg_policy_data *pd; char *s[4], *p, *major_s = NULL, *minor_s = NULL; unsigned long major, minor; int i = 0, ret = -EINVAL; @@ -793,43 +1020,53 @@ static int blkio_policy_parse_and_set(char *buf, if (strict_strtoull(s[1], 10, &temp)) goto out; - /* For rule removal, do not check for device presence. */ - if (temp) { - disk = get_gendisk(dev, &part); - if (!disk || part) { - ret = -ENODEV; - goto out; - } + disk = get_gendisk(dev, &part); + if (!disk || part) + goto out; + + rcu_read_lock(); + + spin_lock_irq(disk->queue->queue_lock); + blkg = blkg_lookup_create(blkcg, disk->queue, plid, false); + spin_unlock_irq(disk->queue->queue_lock); + + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + goto out_unlock; } - newpn->dev = dev; + pd = blkg->pd[plid]; switch (plid) { case BLKIO_POLICY_PROP: if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || temp > BLKIO_WEIGHT_MAX) - goto out; + goto out_unlock; - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.weight = temp; + pd->conf.weight = temp; + blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight); break; case BLKIO_POLICY_THROTL: switch(fileid) { case BLKIO_THROTL_read_bps_device: + pd->conf.bps[READ] = temp; + blkio_update_group_bps(blkg, plid, temp ?: -1, fileid); + break; case BLKIO_THROTL_write_bps_device: - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.bps = temp; + pd->conf.bps[WRITE] = temp; + blkio_update_group_bps(blkg, plid, temp ?: -1, fileid); break; case BLKIO_THROTL_read_iops_device: + if (temp > THROTL_IOPS_MAX) + goto out_unlock; + pd->conf.iops[READ] = temp; + blkio_update_group_iops(blkg, plid, temp ?: -1, fileid); + break; case BLKIO_THROTL_write_iops_device: if (temp > THROTL_IOPS_MAX) - goto out; - - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.iops = (unsigned int)temp; + goto out_unlock; + pd->conf.iops[WRITE] = temp; + blkio_update_group_iops(blkg, plid, temp ?: -1, fileid); break; } break; @@ -837,204 +1074,22 @@ static int blkio_policy_parse_and_set(char *buf, BUG(); } ret = 0; +out_unlock: + rcu_read_unlock(); out: put_disk(disk); - return ret; -} - -unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, - dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - unsigned int weight; - - spin_lock_irqsave(&blkcg->lock, flags); - - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, - BLKIO_PROP_weight_device); - if (pn) - weight = pn->val.weight; - else - weight = blkcg->weight; - - spin_unlock_irqrestore(&blkcg->lock, flags); - - return weight; -} -EXPORT_SYMBOL_GPL(blkcg_get_weight); -uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - uint64_t bps = -1; - - spin_lock_irqsave(&blkcg->lock, flags); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_bps_device); - if (pn) - bps = pn->val.bps; - spin_unlock_irqrestore(&blkcg->lock, flags); - - return bps; -} - -uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - uint64_t bps = -1; - - spin_lock_irqsave(&blkcg->lock, flags); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_bps_device); - if (pn) - bps = pn->val.bps; - spin_unlock_irqrestore(&blkcg->lock, flags); - - return bps; -} - -unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - unsigned int iops = -1; - - spin_lock_irqsave(&blkcg->lock, flags); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_iops_device); - if (pn) - iops = pn->val.iops; - spin_unlock_irqrestore(&blkcg->lock, flags); - - return iops; -} - -unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - unsigned int iops = -1; - - spin_lock_irqsave(&blkcg->lock, flags); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_iops_device); - if (pn) - iops = pn->val.iops; - spin_unlock_irqrestore(&blkcg->lock, flags); - - return iops; -} - -/* Checks whether user asked for deleting a policy rule */ -static bool blkio_delete_rule_command(struct blkio_policy_node *pn) -{ - switch(pn->plid) { - case BLKIO_POLICY_PROP: - if (pn->val.weight == 0) - return 1; - break; - case BLKIO_POLICY_THROTL: - switch(pn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - if (pn->val.bps == 0) - return 1; - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - if (pn->val.iops == 0) - return 1; - } - break; - default: - BUG(); - } - - return 0; -} - -static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, - struct blkio_policy_node *newpn) -{ - switch(oldpn->plid) { - case BLKIO_POLICY_PROP: - oldpn->val.weight = newpn->val.weight; - break; - case BLKIO_POLICY_THROTL: - switch(newpn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - oldpn->val.bps = newpn->val.bps; - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - oldpn->val.iops = newpn->val.iops; - } - break; - default: - BUG(); - } -} - -/* - * Some rules/values in blkg have changed. Propagate those to respective - * policies. - */ -static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, struct blkio_policy_node *pn) -{ - unsigned int weight, iops; - u64 bps; - - switch(pn->plid) { - case BLKIO_POLICY_PROP: - weight = pn->val.weight ? pn->val.weight : - blkcg->weight; - blkio_update_group_weight(blkg, weight); - break; - case BLKIO_POLICY_THROTL: - switch(pn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - bps = pn->val.bps ? pn->val.bps : (-1); - blkio_update_group_bps(blkg, bps, pn->fileid); - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - iops = pn->val.iops ? pn->val.iops : (-1); - blkio_update_group_iops(blkg, iops, pn->fileid); - break; - } - break; - default: - BUG(); - } -} - -/* - * A policy node rule has been updated. Propagate this update to all the - * block groups which might be affected by this update. - */ -static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, - struct blkio_policy_node *pn) -{ - struct blkio_group *blkg; - struct hlist_node *n; - - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (pn->dev != blkg->dev || pn->plid != blkg->plid) - continue; - blkio_update_blkg_policy(blkcg, blkg, pn); + /* + * If queue was bypassing, we should retry. Do so after a short + * msleep(). It isn't strictly necessary but queue can be + * bypassing for some time and it's always nice to avoid busy + * looping. + */ + if (ret == -EBUSY) { + msleep(10); + return restart_syscall(); } - - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); + return ret; } static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, @@ -1042,9 +1097,7 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, { int ret = 0; char *buf; - struct blkio_policy_node *newpn, *pn; - struct blkio_cgroup *blkcg; - int keep_newpn = 0; + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp); enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); int fileid = BLKIOFILE_ATTR(cft->private); @@ -1052,71 +1105,52 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, if (!buf) return -ENOMEM; - newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); - if (!newpn) { - ret = -ENOMEM; - goto free_buf; - } - - ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); - if (ret) - goto free_newpn; - - blkcg = cgroup_to_blkio_cgroup(cgrp); - - spin_lock_irq(&blkcg->lock); - - pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); - if (!pn) { - if (!blkio_delete_rule_command(newpn)) { - blkio_policy_insert_node(blkcg, newpn); - keep_newpn = 1; - } - spin_unlock_irq(&blkcg->lock); - goto update_io_group; - } - - if (blkio_delete_rule_command(newpn)) { - blkio_policy_delete_node(pn); - kfree(pn); - spin_unlock_irq(&blkcg->lock); - goto update_io_group; - } - spin_unlock_irq(&blkcg->lock); - - blkio_update_policy_rule(pn, newpn); - -update_io_group: - blkio_update_policy_node_blkg(blkcg, newpn); - -free_newpn: - if (!keep_newpn) - kfree(newpn); -free_buf: + ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg); kfree(buf); return ret; } -static void -blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) +static const char *blkg_dev_name(struct blkio_group *blkg) +{ + /* some drivers (floppy) instantiate a queue w/o disk registered */ + if (blkg->q->backing_dev_info.dev) + return dev_name(blkg->q->backing_dev_info.dev); + return NULL; +} + +static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg, + struct seq_file *m) { - switch(pn->plid) { + int plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + struct blkg_policy_data *pd = blkg->pd[plid]; + const char *dname = blkg_dev_name(blkg); + int rw = WRITE; + + if (!dname) + return; + + switch (plid) { case BLKIO_POLICY_PROP: - if (pn->fileid == BLKIO_PROP_weight_device) - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.weight); + if (pd->conf.weight) + seq_printf(m, "%s\t%u\n", + dname, pd->conf.weight); break; case BLKIO_POLICY_THROTL: - switch(pn->fileid) { + switch (fileid) { case BLKIO_THROTL_read_bps_device: + rw = READ; case BLKIO_THROTL_write_bps_device: - seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.bps); + if (pd->conf.bps[rw]) + seq_printf(m, "%s\t%llu\n", + dname, pd->conf.bps[rw]); break; case BLKIO_THROTL_read_iops_device: + rw = READ; case BLKIO_THROTL_write_iops_device: - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.iops); + if (pd->conf.iops[rw]) + seq_printf(m, "%s\t%u\n", + dname, pd->conf.iops[rw]); break; } break; @@ -1126,20 +1160,16 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) } /* cgroup files which read their data from policy nodes end up here */ -static void blkio_read_policy_node_files(struct cftype *cft, - struct blkio_cgroup *blkcg, struct seq_file *m) +static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg, + struct seq_file *m) { - struct blkio_policy_node *pn; - - if (!list_empty(&blkcg->policy_list)) { - spin_lock_irq(&blkcg->lock); - list_for_each_entry(pn, &blkcg->policy_list, node) { - if (!pn_matches_cftype(cft, pn)) - continue; - blkio_print_policy_node(m, pn); - } - spin_unlock_irq(&blkcg->lock); - } + struct blkio_group *blkg; + struct hlist_node *n; + + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + blkio_print_group_conf(cft, blkg, m); + spin_unlock_irq(&blkcg->lock); } static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, @@ -1155,7 +1185,7 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, case BLKIO_POLICY_PROP: switch(name) { case BLKIO_PROP_weight_device: - blkio_read_policy_node_files(cft, blkcg, m); + blkio_read_conf(cft, blkcg, m); return 0; default: BUG(); @@ -1167,7 +1197,7 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, case BLKIO_THROTL_write_bps_device: case BLKIO_THROTL_read_iops_device: case BLKIO_THROTL_write_iops_device: - blkio_read_policy_node_files(cft, blkcg, m); + blkio_read_conf(cft, blkcg, m); return 0; default: BUG(); @@ -1188,25 +1218,25 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, struct hlist_node *n; uint64_t cgroup_total = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (blkg->dev) { - if (!cftype_blkg_same_policy(cft, blkg)) - continue; - if (pcpu) - cgroup_total += blkio_get_stat_cpu(blkg, cb, - blkg->dev, type); - else { - spin_lock_irq(&blkg->stats_lock); - cgroup_total += blkio_get_stat(blkg, cb, - blkg->dev, type); - spin_unlock_irq(&blkg->stats_lock); - } - } + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + const char *dname = blkg_dev_name(blkg); + int plid = BLKIOFILE_POLICY(cft->private); + + if (!dname) + continue; + if (pcpu) + cgroup_total += blkio_get_stat_cpu(blkg, plid, + cb, dname, type); + else + cgroup_total += blkio_get_stat(blkg, plid, + cb, dname, type); } if (show_total) cb->fill(cb, "Total", cgroup_total); - rcu_read_unlock(); + + spin_unlock_irq(&blkcg->lock); return 0; } @@ -1243,7 +1273,7 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, BLKIO_STAT_WAIT_TIME, 1, 0); case BLKIO_PROP_io_merged: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_CPU_MERGED, 1, 1); + BLKIO_STAT_MERGED, 1, 0); case BLKIO_PROP_io_queued: return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_QUEUED, 1, 0); @@ -1290,11 +1320,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val) { struct blkio_group *blkg; struct hlist_node *n; - struct blkio_policy_node *pn; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; @@ -1304,13 +1333,12 @@ static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) blkcg->weight = (unsigned int)val; hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - pn = blkio_policy_search_node(blkcg, blkg->dev, - BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); - if (pn) - continue; + struct blkg_policy_data *pd = blkg->pd[plid]; - blkio_update_group_weight(blkg, blkcg->weight); + if (!pd->conf.weight) + blkio_update_group_weight(blkg, plid, blkcg->weight); } + spin_unlock_irq(&blkcg->lock); spin_unlock(&blkio_list_lock); return 0; @@ -1349,7 +1377,7 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) case BLKIO_POLICY_PROP: switch(name) { case BLKIO_PROP_weight: - return blkio_weight_write(blkcg, val); + return blkio_weight_write(blkcg, plid, val); } break; default: @@ -1518,58 +1546,53 @@ struct cftype blkio_files[] = { { } /* terminate */ }; -static void blkiocg_destroy(struct cgroup *cgroup) +/** + * blkiocg_pre_destroy - cgroup pre_destroy callback + * @cgroup: cgroup of interest + * + * This function is called when @cgroup is about to go away and responsible + * for shooting down all blkgs associated with @cgroup. blkgs should be + * removed while holding both q and blkcg locks. As blkcg lock is nested + * inside q lock, this function performs reverse double lock dancing. + * + * This is the blkcg counterpart of ioc_release_fn(). + */ +static int blkiocg_pre_destroy(struct cgroup *cgroup) { struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); - unsigned long flags; - struct blkio_group *blkg; - void *key; - struct blkio_policy_type *blkiop; - struct blkio_policy_node *pn, *pntmp; - rcu_read_lock(); - do { - spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_irq(&blkcg->lock); - if (hlist_empty(&blkcg->blkg_list)) { - spin_unlock_irqrestore(&blkcg->lock, flags); - break; + while (!hlist_empty(&blkcg->blkg_list)) { + struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first, + struct blkio_group, blkcg_node); + struct request_queue *q = blkg->q; + + if (spin_trylock(q->queue_lock)) { + blkg_destroy(blkg); + spin_unlock(q->queue_lock); + } else { + spin_unlock_irq(&blkcg->lock); + cpu_relax(); + spin_lock_irq(&blkcg->lock); } + } - blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, - blkcg_node); - key = rcu_dereference(blkg->key); - __blkiocg_del_blkio_group(blkg); - - spin_unlock_irqrestore(&blkcg->lock, flags); - - /* - * This blkio_group is being unlinked as associated cgroup is - * going away. Let all the IO controlling policies know about - * this event. - */ - spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) { - if (blkiop->plid != blkg->plid) - continue; - blkiop->ops.blkio_unlink_group_fn(key, blkg); - } - spin_unlock(&blkio_list_lock); - } while (1); + spin_unlock_irq(&blkcg->lock); + return 0; +} - list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { - blkio_policy_delete_node(pn); - kfree(pn); - } +static void blkiocg_destroy(struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); - free_css_id(&blkio_subsys, &blkcg->css); - rcu_read_unlock(); if (blkcg != &blkio_root_cgroup) kfree(blkcg); } static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) { + static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkio_cgroup *blkcg; struct cgroup *parent = cgroup->parent; @@ -1583,14 +1606,72 @@ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) return ERR_PTR(-ENOMEM); blkcg->weight = BLKIO_WEIGHT_DEFAULT; + blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ done: spin_lock_init(&blkcg->lock); INIT_HLIST_HEAD(&blkcg->blkg_list); - INIT_LIST_HEAD(&blkcg->policy_list); return &blkcg->css; } +/** + * blkcg_init_queue - initialize blkcg part of request queue + * @q: request_queue to initialize + * + * Called from blk_alloc_queue_node(). Responsible for initializing blkcg + * part of new request_queue @q. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int blkcg_init_queue(struct request_queue *q) +{ + int ret; + + might_sleep(); + + ret = blk_throtl_init(q); + if (ret) + return ret; + + mutex_lock(&all_q_mutex); + INIT_LIST_HEAD(&q->all_q_node); + list_add_tail(&q->all_q_node, &all_q_list); + mutex_unlock(&all_q_mutex); + + return 0; +} + +/** + * blkcg_drain_queue - drain blkcg part of request_queue + * @q: request_queue to drain + * + * Called from blk_drain_queue(). Responsible for draining blkcg part. + */ +void blkcg_drain_queue(struct request_queue *q) +{ + lockdep_assert_held(q->queue_lock); + + blk_throtl_drain(q); +} + +/** + * blkcg_exit_queue - exit and release blkcg part of request_queue + * @q: request_queue being released + * + * Called from blk_release_queue(). Responsible for exiting blkcg part. + */ +void blkcg_exit_queue(struct request_queue *q) +{ + mutex_lock(&all_q_mutex); + list_del_init(&q->all_q_node); + mutex_unlock(&all_q_mutex); + + blkg_destroy_all(q, true); + + blk_throtl_exit(q); +} + /* * We cannot support shared io contexts, as we have no mean to support * two tasks with the same ioc in two different groups without major rework @@ -1616,63 +1697,74 @@ static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) return ret; } -static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void blkcg_bypass_start(void) + __acquires(&all_q_mutex) { - struct task_struct *task; - struct io_context *ioc; + struct request_queue *q; - cgroup_taskset_for_each(task, cgrp, tset) { - /* we don't lose anything even if ioc allocation fails */ - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc_cgroup_changed(ioc); - put_io_context(ioc); - } + mutex_lock(&all_q_mutex); + + list_for_each_entry(q, &all_q_list, all_q_node) { + blk_queue_bypass_start(q); + blkg_destroy_all(q, false); } } +static void blkcg_bypass_end(void) + __releases(&all_q_mutex) +{ + struct request_queue *q; + + list_for_each_entry(q, &all_q_list, all_q_node) + blk_queue_bypass_end(q); + + mutex_unlock(&all_q_mutex); +} + struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, + .pre_destroy = blkiocg_pre_destroy, .destroy = blkiocg_destroy, -#ifdef CONFIG_BLK_CGROUP - /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ .subsys_id = blkio_subsys_id, -#endif .base_cftypes = blkio_files, - .use_id = 1, .module = THIS_MODULE, }; EXPORT_SYMBOL_GPL(blkio_subsys); void blkio_policy_register(struct blkio_policy_type *blkiop) { + struct request_queue *q; + + blkcg_bypass_start(); spin_lock(&blkio_list_lock); + + BUG_ON(blkio_policy[blkiop->plid]); + blkio_policy[blkiop->plid] = blkiop; list_add_tail(&blkiop->list, &blkio_list); + spin_unlock(&blkio_list_lock); + list_for_each_entry(q, &all_q_list, all_q_node) + update_root_blkg_pd(q, blkiop->plid); + blkcg_bypass_end(); } EXPORT_SYMBOL_GPL(blkio_policy_register); void blkio_policy_unregister(struct blkio_policy_type *blkiop) { + struct request_queue *q; + + blkcg_bypass_start(); spin_lock(&blkio_list_lock); + + BUG_ON(blkio_policy[blkiop->plid] != blkiop); + blkio_policy[blkiop->plid] = NULL; list_del_init(&blkiop->list); + spin_unlock(&blkio_list_lock); + list_for_each_entry(q, &all_q_list, all_q_node) + update_root_blkg_pd(q, blkiop->plid); + blkcg_bypass_end(); } EXPORT_SYMBOL_GPL(blkio_policy_unregister); - -static int __init init_cgroup_blkio(void) -{ - return cgroup_load_subsys(&blkio_subsys); -} - -static void __exit exit_cgroup_blkio(void) -{ - cgroup_unload_subsys(&blkio_subsys); -} - -module_init(init_cgroup_blkio); -module_exit(exit_cgroup_blkio); -MODULE_LICENSE("GPL"); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 6f3ace7e792f..1cb8f7643258 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -19,28 +19,27 @@ enum blkio_policy_id { BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ BLKIO_POLICY_THROTL, /* Throttling */ + + BLKIO_NR_POLICIES, }; /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) - -#ifndef CONFIG_BLK_CGROUP -/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ -extern struct cgroup_subsys blkio_subsys; -#define blkio_subsys_id blkio_subsys.subsys_id -#endif +#ifdef CONFIG_BLK_CGROUP enum stat_type { + /* Number of IOs merged */ + BLKIO_STAT_MERGED, /* Total time spent (in ns) between request dispatch to the driver and * request completion for IOs doen by this cgroup. This may not be * accurate when NCQ is turned on. */ - BLKIO_STAT_SERVICE_TIME = 0, + BLKIO_STAT_SERVICE_TIME, /* Total time spent waiting in scheduler queue in ns */ BLKIO_STAT_WAIT_TIME, /* Number of IOs queued up */ BLKIO_STAT_QUEUED, + /* All the single valued stats go below this */ BLKIO_STAT_TIME, #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -54,6 +53,9 @@ enum stat_type { #endif }; +/* Types lower than this live in stat_arr and have subtypes */ +#define BLKIO_STAT_ARR_NR (BLKIO_STAT_QUEUED + 1) + /* Per cpu stats */ enum stat_type_cpu { BLKIO_STAT_CPU_SECTORS, @@ -61,8 +63,6 @@ enum stat_type_cpu { BLKIO_STAT_CPU_SERVICE_BYTES, /* Total IOs serviced, post merge */ BLKIO_STAT_CPU_SERVICED, - /* Number of IOs merged */ - BLKIO_STAT_CPU_MERGED, BLKIO_STAT_CPU_NR }; @@ -116,13 +116,16 @@ struct blkio_cgroup { unsigned int weight; spinlock_t lock; struct hlist_head blkg_list; - struct list_head policy_list; /* list of blkio_policy_node */ + + /* for policies to test whether associated blkcg has changed */ + uint64_t id; }; struct blkio_group_stats { + struct u64_stats_sync syncp; /* total disk time and nr sectors dispatched by this group */ uint64_t time; - uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; + uint64_t stat_arr[BLKIO_STAT_ARR_NR][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP /* Time not charged to this cgroup */ uint64_t unaccounted_time; @@ -136,21 +139,31 @@ struct blkio_group_stats { /* Total time spent waiting for it to be assigned a timeslice. */ uint64_t group_wait_time; - uint64_t start_group_wait_time; /* Time spent idling for this blkio_group */ uint64_t idle_time; - uint64_t start_idle_time; /* * Total time when we have requests queued and do not contain the * current active queue. */ uint64_t empty_time; + + /* fields after this shouldn't be cleared on stat reset */ + uint64_t start_group_wait_time; + uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; #endif }; +#ifdef CONFIG_DEBUG_BLK_CGROUP +#define BLKG_STATS_DEBUG_CLEAR_START \ + offsetof(struct blkio_group_stats, unaccounted_time) +#define BLKG_STATS_DEBUG_CLEAR_SIZE \ + (offsetof(struct blkio_group_stats, start_group_wait_time) - \ + BLKG_STATS_DEBUG_CLEAR_START) +#endif + /* Per cpu blkio group stats */ struct blkio_group_stats_cpu { uint64_t sectors; @@ -158,71 +171,60 @@ struct blkio_group_stats_cpu { struct u64_stats_sync syncp; }; -struct blkio_group { - /* An rcu protected unique identifier for the group */ - void *key; - struct hlist_node blkcg_node; - unsigned short blkcg_id; - /* Store cgroup path */ - char path[128]; - /* The device MKDEV(major, minor), this group has been created for */ - dev_t dev; - /* policy which owns this blk group */ - enum blkio_policy_id plid; +struct blkio_group_conf { + unsigned int weight; + unsigned int iops[2]; + u64 bps[2]; +}; + +/* per-blkg per-policy data */ +struct blkg_policy_data { + /* the blkg this per-policy data belongs to */ + struct blkio_group *blkg; + + /* Configuration */ + struct blkio_group_conf conf; - /* Need to serialize the stats in the case of reset/update */ - spinlock_t stats_lock; struct blkio_group_stats stats; /* Per cpu stats pointer */ struct blkio_group_stats_cpu __percpu *stats_cpu; -}; -struct blkio_policy_node { - struct list_head node; - dev_t dev; - /* This node belongs to max bw policy or porportional weight policy */ - enum blkio_policy_id plid; - /* cgroup file to which this rule belongs to */ - int fileid; - - union { - unsigned int weight; - /* - * Rate read/write in terms of bytes per second - * Whether this rate represents read or write is determined - * by file type "fileid". - */ - u64 bps; - unsigned int iops; - } val; + /* pol->pdata_size bytes of private data used by policy impl */ + char pdata[] __aligned(__alignof__(unsigned long long)); }; -extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, - dev_t dev); -extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, - dev_t dev); -extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, - dev_t dev); -extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, - dev_t dev); -extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, - dev_t dev); +struct blkio_group { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkio_cgroup *blkcg; + /* Store cgroup path */ + char path[128]; + /* reference count */ + int refcnt; -typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); + struct blkg_policy_data *pd[BLKIO_NR_POLICIES]; + + /* List of blkg waiting for per cpu stats memory to be allocated */ + struct list_head alloc_node; + struct rcu_head rcu_head; +}; -typedef void (blkio_update_group_weight_fn) (void *key, +typedef void (blkio_init_group_fn)(struct blkio_group *blkg); +typedef void (blkio_update_group_weight_fn)(struct request_queue *q, struct blkio_group *blkg, unsigned int weight); -typedef void (blkio_update_group_read_bps_fn) (void * key, +typedef void (blkio_update_group_read_bps_fn)(struct request_queue *q, struct blkio_group *blkg, u64 read_bps); -typedef void (blkio_update_group_write_bps_fn) (void *key, +typedef void (blkio_update_group_write_bps_fn)(struct request_queue *q, struct blkio_group *blkg, u64 write_bps); -typedef void (blkio_update_group_read_iops_fn) (void *key, +typedef void (blkio_update_group_read_iops_fn)(struct request_queue *q, struct blkio_group *blkg, unsigned int read_iops); -typedef void (blkio_update_group_write_iops_fn) (void *key, +typedef void (blkio_update_group_write_iops_fn)(struct request_queue *q, struct blkio_group *blkg, unsigned int write_iops); struct blkio_policy_ops { - blkio_unlink_group_fn *blkio_unlink_group_fn; + blkio_init_group_fn *blkio_init_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; @@ -234,17 +236,86 @@ struct blkio_policy_type { struct list_head list; struct blkio_policy_ops ops; enum blkio_policy_id plid; + size_t pdata_size; /* policy specific private data size */ }; +extern int blkcg_init_queue(struct request_queue *q); +extern void blkcg_drain_queue(struct request_queue *q); +extern void blkcg_exit_queue(struct request_queue *q); + /* Blkio controller policy registration */ extern void blkio_policy_register(struct blkio_policy_type *); extern void blkio_policy_unregister(struct blkio_policy_type *); +extern void blkg_destroy_all(struct request_queue *q, bool destroy_root); +extern void update_root_blkg_pd(struct request_queue *q, + enum blkio_policy_id plid); + +/** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline void *blkg_to_pdata(struct blkio_group *blkg, + struct blkio_policy_type *pol) +{ + return blkg ? blkg->pd[pol->plid]->pdata : NULL; +} + +/** + * pdata_to_blkg - get blkg associated with policy private data + * @pdata: policy private data of interest + * @pol: policy @pdata is for + * + * @pdata is policy private data for @pol. Determine the blkg it's + * associated with. + */ +static inline struct blkio_group *pdata_to_blkg(void *pdata, + struct blkio_policy_type *pol) +{ + if (pdata) { + struct blkg_policy_data *pd = + container_of(pdata, struct blkg_policy_data, pdata); + return pd->blkg; + } + return NULL; +} static inline char *blkg_path(struct blkio_group *blkg) { return blkg->path; } +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding queue_lock and an existing reference. + */ +static inline void blkg_get(struct blkio_group *blkg) +{ + lockdep_assert_held(blkg->q->queue_lock); + WARN_ON_ONCE(!blkg->refcnt); + blkg->refcnt++; +} + +void __blkg_release(struct blkio_group *blkg); + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + * + * The caller should be holding queue_lock. + */ +static inline void blkg_put(struct blkio_group *blkg) +{ + lockdep_assert_held(blkg->q->queue_lock); + WARN_ON_ONCE(blkg->refcnt <= 0); + if (!--blkg->refcnt) + __blkg_release(blkg); +} + #else struct blkio_group { @@ -253,10 +324,23 @@ struct blkio_group { struct blkio_policy_type { }; +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_drain_queue(struct request_queue *q) { } +static inline void blkcg_exit_queue(struct request_queue *q) { } static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } - +static inline void blkg_destroy_all(struct request_queue *q, + bool destory_root) { } +static inline void update_root_blkg_pd(struct request_queue *q, + enum blkio_policy_id plid) { } + +static inline void *blkg_to_pdata(struct blkio_group *blkg, + struct blkio_policy_type *pol) { return NULL; } +static inline struct blkio_group *pdata_to_blkg(void *pdata, + struct blkio_policy_type *pol) { return NULL; } static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } +static inline void blkg_get(struct blkio_group *blkg) { } +static inline void blkg_put(struct blkio_group *blkg) { } #endif @@ -265,12 +349,17 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } #define BLKIO_WEIGHT_DEFAULT 500 #ifdef CONFIG_DEBUG_BLK_CGROUP -void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol); void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue); -void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); -void blkiocg_update_idle_time_stats(struct blkio_group *blkg); -void blkiocg_set_start_empty_time(struct blkio_group *blkg); + struct blkio_policy_type *pol, + unsigned long dequeue); +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol); +void blkiocg_update_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol); +void blkiocg_set_start_empty_time(struct blkio_group *blkg, + struct blkio_policy_type *pol); #define BLKG_FLAG_FNS(name) \ static inline void blkio_mark_blkg_##name( \ @@ -293,72 +382,77 @@ BLKG_FLAG_FNS(idling) BLKG_FLAG_FNS(empty) #undef BLKG_FLAG_FNS #else -static inline void blkiocg_update_avg_queue_size_stats( - struct blkio_group *blkg) {} +static inline void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { } static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) {} -static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) -{} -static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} -static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} + struct blkio_policy_type *pol, unsigned long dequeue) { } +static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { } +static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { } +static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg, + struct blkio_policy_type *pol) { } #endif -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) +#ifdef CONFIG_BLK_CGROUP extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); -extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); -extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid); -extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); -extern int blkiocg_del_blkio_group(struct blkio_group *blkg); -extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, - void *key); +extern struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio); +extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg, + struct request_queue *q); +struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg, + struct request_queue *q, + enum blkio_policy_id plid, + bool for_root); void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time, - unsigned long unaccounted_time); -void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, - bool direction, bool sync); + struct blkio_policy_type *pol, + unsigned long time, + unsigned long unaccounted_time); +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol, + uint64_t bytes, bool direction, bool sync); void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); -void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, - bool sync); + struct blkio_policy_type *pol, + uint64_t start_time, + uint64_t io_start_time, bool direction, + bool sync); +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol, + bool direction, bool sync); void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync); + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg, bool direction, + bool sync); void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync); + struct blkio_policy_type *pol, + bool direction, bool sync); #else struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline struct blkio_cgroup * -task_blkio_cgroup(struct task_struct *tsk) { return NULL; } - -static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid) {} - -static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } - -static inline int -blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } +bio_blkio_cgroup(struct bio *bio) { return NULL; } -static inline struct blkio_group * -blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg, + void *key) { return NULL; } static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time, - unsigned long unaccounted_time) -{} + struct blkio_policy_type *pol, unsigned long time, + unsigned long unaccounted_time) { } static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) {} + struct blkio_policy_type *pol, uint64_t bytes, + bool direction, bool sync) { } static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, - bool sync) {} + struct blkio_policy_type *pol, uint64_t start_time, + uint64_t io_start_time, bool direction, bool sync) { } static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) {} + struct blkio_policy_type *pol, bool direction, + bool sync) { } static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) {} + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg, bool direction, + bool sync) { } static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) {} + struct blkio_policy_type *pol, bool direction, + bool sync) { } #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 3a78b00edd71..991c1d6ef245 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-cgroup.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -280,7 +281,7 @@ EXPORT_SYMBOL(blk_stop_queue); * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevaotor_exit() - * and blk_throtl_exit() to be called with queue lock initialized. + * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) @@ -365,17 +366,23 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) spin_lock_irq(q->queue_lock); - elv_drain_elevator(q); - if (drain_all) - blk_throtl_drain(q); + /* + * The caller might be trying to drain @q before its + * elevator is initialized. + */ + if (q->elevator) + elv_drain_elevator(q); + + blkcg_drain_queue(q); /* * This function might be called on a queue which failed - * driver init after queue creation. Some drivers - * (e.g. fd) get unhappy in such cases. Kick queue iff - * dispatch queue has something on it. + * driver init after queue creation or is not yet fully + * active yet. Some drivers (e.g. fd and loop) get unhappy + * in such cases. Kick queue iff dispatch queue has + * something on it and @q has request_fn set. */ - if (!list_empty(&q->queue_head)) + if (!list_empty(&q->queue_head) && q->request_fn) __blk_run_queue(q); drain |= q->rq.elvpriv; @@ -403,6 +410,42 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) } /** + * blk_queue_bypass_start - enter queue bypass mode + * @q: queue of interest + * + * In bypass mode, only the dispatch FIFO queue of @q is used. This + * function makes @q enter bypass mode and drains all requests which were + * throttled or issued before. On return, it's guaranteed that no request + * is being throttled or has ELVPRIV set. + */ +void blk_queue_bypass_start(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + spin_unlock_irq(q->queue_lock); + + blk_drain_queue(q, false); +} +EXPORT_SYMBOL_GPL(blk_queue_bypass_start); + +/** + * blk_queue_bypass_end - leave queue bypass mode + * @q: queue of interest + * + * Leave bypass mode and restore the normal queueing behavior. + */ +void blk_queue_bypass_end(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + if (!--q->bypass_depth) + queue_flag_clear(QUEUE_FLAG_BYPASS, q); + WARN_ON_ONCE(q->bypass_depth < 0); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(blk_queue_bypass_end); + +/** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown * @@ -418,6 +461,11 @@ void blk_cleanup_queue(struct request_queue *q) queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); spin_lock_irq(lock); + + /* dead queue is permanently in bypass mode till released */ + q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + queue_flag_set(QUEUE_FLAG_NOMERGES, q); queue_flag_set(QUEUE_FLAG_NOXMERGES, q); queue_flag_set(QUEUE_FLAG_DEAD, q); @@ -428,13 +476,8 @@ void blk_cleanup_queue(struct request_queue *q) spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); - /* - * Drain all requests queued before DEAD marking. The caller might - * be trying to tear down @q before its elevator is initialized, in - * which case we don't want to call into draining. - */ - if (q->elevator) - blk_drain_queue(q, true); + /* drain all requests queued before DEAD marking */ + blk_drain_queue(q, true); /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); @@ -498,14 +541,15 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (err) goto fail_id; - if (blk_throtl_init(q)) - goto fail_id; - setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); +#ifdef CONFIG_BLK_CGROUP + INIT_LIST_HEAD(&q->blkg_list); +#endif INIT_LIST_HEAD(&q->flush_queue[0]); INIT_LIST_HEAD(&q->flush_queue[1]); INIT_LIST_HEAD(&q->flush_data_in_flight); @@ -522,6 +566,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) */ q->queue_lock = &q->__queue_lock; + if (blkcg_init_queue(q)) + goto fail_id; + return q; fail_id: @@ -649,7 +696,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) } static struct request * -blk_alloc_request(struct request_queue *q, struct io_cq *icq, +blk_alloc_request(struct request_queue *q, struct bio *bio, struct io_cq *icq, unsigned int flags, gfp_t gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -663,7 +710,7 @@ blk_alloc_request(struct request_queue *q, struct io_cq *icq, if (flags & REQ_ELVPRIV) { rq->elv.icq = icq; - if (unlikely(elv_set_request(q, rq, gfp_mask))) { + if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) { mempool_free(rq, q->rq.rq_pool); return NULL; } @@ -763,6 +810,22 @@ static bool blk_rq_should_init_elevator(struct bio *bio) } /** + * rq_ioc - determine io_context for request allocation + * @bio: request being allocated is for this bio (can be %NULL) + * + * Determine io_context to use for request allocation for @bio. May return + * %NULL if %current->io_context doesn't exist. + */ +static struct io_context *rq_ioc(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (bio && bio->bi_ioc) + return bio->bi_ioc; +#endif + return current->io_context; +} + +/** * get_request - get a free request * @q: request_queue to allocate request from * @rw_flags: RW and SYNC flags @@ -779,7 +842,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio) static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) { - struct request *rq = NULL; + struct request *rq; struct request_list *rl = &q->rq; struct elevator_type *et; struct io_context *ioc; @@ -789,7 +852,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, int may_queue; retry: et = q->elevator->type; - ioc = current->io_context; + ioc = rq_ioc(bio); if (unlikely(blk_queue_dead(q))) return NULL; @@ -808,7 +871,7 @@ retry: */ if (!ioc && !retried) { spin_unlock_irq(q->queue_lock); - create_io_context(current, gfp_mask, q->node); + create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); retried = true; goto retry; @@ -831,7 +894,7 @@ retry: * process is not a "batcher", and not * exempted by the IO scheduler */ - goto out; + return NULL; } } } @@ -844,7 +907,7 @@ retry: * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - goto out; + return NULL; rl->count[is_sync]++; rl->starved[is_sync] = 0; @@ -859,8 +922,7 @@ retry: * Also, lookup icq while holding queue_lock. If it doesn't exist, * it will be created after releasing queue_lock. */ - if (blk_rq_should_init_elevator(bio) && - !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { + if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { rw_flags |= REQ_ELVPRIV; rl->elvpriv++; if (et->icq_cache && ioc) @@ -873,38 +935,18 @@ retry: /* create icq if missing */ if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) { - icq = ioc_create_icq(q, gfp_mask); + create_io_context(gfp_mask, q->node); + ioc = rq_ioc(bio); + if (!ioc) + goto fail_alloc; + icq = ioc_create_icq(ioc, q, gfp_mask); if (!icq) - goto fail_icq; + goto fail_alloc; } - rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); - -fail_icq: - if (unlikely(!rq)) { - /* - * Allocation failed presumably due to memory. Undo anything - * we might have messed up. - * - * Allocating task should really be put onto the front of the - * wait queue, but this is pretty rare. - */ - spin_lock_irq(q->queue_lock); - freed_request(q, rw_flags); - - /* - * in the very unlikely event that allocation failed and no - * requests for this direction was pending, mark us starved - * so that freeing of a request in the other direction will - * notice us. another possible fix would be to split the - * rq mempool into READ and WRITE - */ -rq_starved: - if (unlikely(rl->count[is_sync] == 0)) - rl->starved[is_sync] = 1; - - goto out; - } + rq = blk_alloc_request(q, bio, icq, rw_flags, gfp_mask); + if (unlikely(!rq)) + goto fail_alloc; /* * ioc may be NULL here, and ioc_batching will be false. That's @@ -916,8 +958,30 @@ rq_starved: ioc->nr_batch_requests--; trace_block_getrq(q, bio, rw_flags & 1); -out: return rq; + +fail_alloc: + /* + * Allocation failed presumably due to memory. Undo anything we + * might have messed up. + * + * Allocating task should really be put onto the front of the wait + * queue, but this is pretty rare. + */ + spin_lock_irq(q->queue_lock); + freed_request(q, rw_flags); + + /* + * in the very unlikely event that allocation failed and no + * requests for this direction was pending, mark us starved so that + * freeing of a request in the other direction will notice + * us. another possible fix would be to split the rq mempool into + * READ and WRITE + */ +rq_starved: + if (unlikely(rl->count[is_sync] == 0)) + rl->starved[is_sync] = 1; + return NULL; } /** @@ -961,7 +1025,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, * up to a big batch of them for a small period time. * See ioc_batching, ioc_set_batching */ - create_io_context(current, GFP_NOIO, q->node); + create_io_context(GFP_NOIO, q->node); ioc_set_batching(q, current->io_context); spin_lock_irq(q->queue_lock); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index fb95dd2f889a..1e2d53b04858 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -155,20 +155,20 @@ void put_io_context(struct io_context *ioc) } EXPORT_SYMBOL(put_io_context); -/* Called by the exiting task */ -void exit_io_context(struct task_struct *task) +/** + * put_io_context_active - put active reference on ioc + * @ioc: ioc of interest + * + * Undo get_io_context_active(). If active reference reaches zero after + * put, @ioc can never issue further IOs and ioscheds are notified. + */ +void put_io_context_active(struct io_context *ioc) { - struct io_context *ioc; - struct io_cq *icq; struct hlist_node *n; unsigned long flags; + struct io_cq *icq; - task_lock(task); - ioc = task->io_context; - task->io_context = NULL; - task_unlock(task); - - if (!atomic_dec_and_test(&ioc->nr_tasks)) { + if (!atomic_dec_and_test(&ioc->active_ref)) { put_io_context(ioc); return; } @@ -197,6 +197,20 @@ retry: put_io_context(ioc); } +/* Called by the exiting task */ +void exit_io_context(struct task_struct *task) +{ + struct io_context *ioc; + + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); + + atomic_dec(&ioc->nr_tasks); + put_io_context_active(ioc); +} + /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared @@ -218,19 +232,18 @@ void ioc_clear_queue(struct request_queue *q) } } -void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, - int node) +int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) { struct io_context *ioc; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) - return; + return -ENOMEM; /* initialize */ atomic_long_set(&ioc->refcount, 1); - atomic_set(&ioc->nr_tasks, 1); + atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->icq_list); @@ -250,6 +263,8 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, else kmem_cache_free(iocontext_cachep, ioc); task_unlock(task); + + return 0; } /** @@ -281,7 +296,7 @@ struct io_context *get_task_io_context(struct task_struct *task, return ioc; } task_unlock(task); - } while (create_io_context(task, gfp_flags, node)); + } while (!create_task_io_context(task, gfp_flags, node)); return NULL; } @@ -325,26 +340,23 @@ EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq + * @ioc: io_context of interest * @q: request_queue of interest * @gfp_mask: allocation mask * - * Make sure io_cq linking %current->io_context and @q exists. If either - * io_context and/or icq don't exist, they will be created using @gfp_mask. + * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they + * will be created using @gfp_mask. * * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ -struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) +struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, + gfp_t gfp_mask) { struct elevator_type *et = q->elevator->type; - struct io_context *ioc; struct io_cq *icq; /* allocate stuff */ - ioc = create_io_context(current, gfp_mask, q->node); - if (!ioc) - return NULL; - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, q->node); if (!icq) @@ -382,74 +394,6 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) return icq; } -void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags) -{ - struct io_cq *icq; - struct hlist_node *n; - - hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) - icq->flags |= flags; -} - -/** - * ioc_ioprio_changed - notify ioprio change - * @ioc: io_context of interest - * @ioprio: new ioprio - * - * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all - * icq's. iosched is responsible for checking the bit and applying it on - * request issue path. - */ -void ioc_ioprio_changed(struct io_context *ioc, int ioprio) -{ - unsigned long flags; - - spin_lock_irqsave(&ioc->lock, flags); - ioc->ioprio = ioprio; - ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED); - spin_unlock_irqrestore(&ioc->lock, flags); -} - -/** - * ioc_cgroup_changed - notify cgroup change - * @ioc: io_context of interest - * - * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's. - * iosched is responsible for checking the bit and applying it on request - * issue path. - */ -void ioc_cgroup_changed(struct io_context *ioc) -{ - unsigned long flags; - - spin_lock_irqsave(&ioc->lock, flags); - ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED); - spin_unlock_irqrestore(&ioc->lock, flags); -} -EXPORT_SYMBOL(ioc_cgroup_changed); - -/** - * icq_get_changed - fetch and clear icq changed mask - * @icq: icq of interest - * - * Fetch and clear ICQ_*_CHANGED bits from @icq. Grabs and releases - * @icq->ioc->lock. - */ -unsigned icq_get_changed(struct io_cq *icq) -{ - unsigned int changed = 0; - unsigned long flags; - - if (unlikely(icq->flags & ICQ_CHANGED_MASK)) { - spin_lock_irqsave(&icq->ioc->lock, flags); - changed = icq->flags & ICQ_CHANGED_MASK; - icq->flags &= ~ICQ_CHANGED_MASK; - spin_unlock_irqrestore(&icq->ioc->lock, flags); - } - return changed; -} -EXPORT_SYMBOL(icq_get_changed); - static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cf150011d808..aa41b47c22d2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -9,6 +9,7 @@ #include <linux/blktrace_api.h> #include "blk.h" +#include "blk-cgroup.h" struct queue_sysfs_entry { struct attribute attr; @@ -479,6 +480,8 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); + blkcg_exit_queue(q); + if (q->elevator) { spin_lock_irq(q->queue_lock); ioc_clear_queue(q); @@ -486,15 +489,12 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q->elevator); } - blk_throtl_exit(q); - if (rl->rq_pool) mempool_destroy(rl->rq_pool); if (q->queue_tags) __blk_queue_free_tags(q); - blk_throtl_release(q); blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5eed6a76721d..4ba141820a2e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -21,6 +21,8 @@ static int throtl_quantum = 32; /* Throttling is performed over 100ms slice and after that slice is renewed */ static unsigned long throtl_slice = HZ/10; /* 100 ms */ +static struct blkio_policy_type blkio_policy_throtl; + /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; static void throtl_schedule_delayed_work(struct throtl_data *td, @@ -39,9 +41,6 @@ struct throtl_rb_root { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) struct throtl_grp { - /* List of throtl groups on the request queue*/ - struct hlist_node tg_node; - /* active throtl group service_tree member */ struct rb_node rb_node; @@ -52,8 +51,6 @@ struct throtl_grp { */ unsigned long disptime; - struct blkio_group blkg; - atomic_t ref; unsigned int flags; /* Two lists for READ and WRITE */ @@ -79,15 +76,10 @@ struct throtl_grp { /* Some throttle limits got updated for the group */ int limits_changed; - - struct rcu_head rcu_head; }; struct throtl_data { - /* List of throtl groups */ - struct hlist_head tg_list; - /* service tree for active throtl groups */ struct throtl_rb_root tg_service_tree; @@ -108,6 +100,16 @@ struct throtl_data int limits_changed; }; +static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg) +{ + return blkg_to_pdata(blkg, &blkio_policy_throtl); +} + +static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg) +{ + return pdata_to_blkg(tg, &blkio_policy_throtl); +} + enum tg_state_flags { THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ }; @@ -130,242 +132,68 @@ THROTL_TG_FNS(on_rr); #define throtl_log_tg(td, tg, fmt, args...) \ blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ - blkg_path(&(tg)->blkg), ##args); \ + blkg_path(tg_to_blkg(tg)), ##args); \ #define throtl_log(td, fmt, args...) \ blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) -static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) -{ - if (blkg) - return container_of(blkg, struct throtl_grp, blkg); - - return NULL; -} - static inline unsigned int total_nr_queued(struct throtl_data *td) { return td->nr_queued[0] + td->nr_queued[1]; } -static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +static void throtl_init_blkio_group(struct blkio_group *blkg) { - atomic_inc(&tg->ref); - return tg; -} - -static void throtl_free_tg(struct rcu_head *head) -{ - struct throtl_grp *tg; - - tg = container_of(head, struct throtl_grp, rcu_head); - free_percpu(tg->blkg.stats_cpu); - kfree(tg); -} + struct throtl_grp *tg = blkg_to_tg(blkg); -static void throtl_put_tg(struct throtl_grp *tg) -{ - BUG_ON(atomic_read(&tg->ref) <= 0); - if (!atomic_dec_and_test(&tg->ref)) - return; - - /* - * A group is freed in rcu manner. But having an rcu lock does not - * mean that one can access all the fields of blkg and assume these - * are valid. For example, don't try to follow throtl_data and - * request queue links. - * - * Having a reference to blkg under an rcu allows acess to only - * values local to groups like group stats and group rate limits - */ - call_rcu(&tg->rcu_head, throtl_free_tg); -} - -static void throtl_init_group(struct throtl_grp *tg) -{ - INIT_HLIST_NODE(&tg->tg_node); RB_CLEAR_NODE(&tg->rb_node); bio_list_init(&tg->bio_lists[0]); bio_list_init(&tg->bio_lists[1]); tg->limits_changed = false; - /* Practically unlimited BW */ - tg->bps[0] = tg->bps[1] = -1; - tg->iops[0] = tg->iops[1] = -1; - - /* - * Take the initial reference that will be released on destroy - * This can be thought of a joint reference by cgroup and - * request queue which will be dropped by either request queue - * exit or cgroup deletion path depending on who is exiting first. - */ - atomic_set(&tg->ref, 1); -} - -/* Should be called with rcu read lock held (needed for blkcg) */ -static void -throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) -{ - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; -} - -static void -__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) -{ - struct backing_dev_info *bdi = &td->queue->backing_dev_info; - unsigned int major, minor; - - if (!tg || tg->blkg.dev) - return; - - /* - * Fill in device details for a group which might not have been - * filled at group creation time as queue was being instantiated - * and driver had not attached a device yet - */ - if (bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - tg->blkg.dev = MKDEV(major, minor); - } -} - -/* - * Should be called with without queue lock held. Here queue lock will be - * taken rarely. It will be taken only once during life time of a group - * if need be - */ -static void -throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) -{ - if (!tg || tg->blkg.dev) - return; - - spin_lock_irq(td->queue->queue_lock); - __throtl_tg_fill_dev_details(td, tg); - spin_unlock_irq(td->queue->queue_lock); -} - -static void throtl_init_add_tg_lists(struct throtl_data *td, - struct throtl_grp *tg, struct blkio_cgroup *blkcg) -{ - __throtl_tg_fill_dev_details(td, tg); - - /* Add group onto cgroup list */ - blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, - tg->blkg.dev, BLKIO_POLICY_THROTL); - - tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); - tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); - tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); - tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); - - throtl_add_group_to_td_list(td, tg); -} - -/* Should be called without queue lock and outside of rcu period */ -static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) -{ - struct throtl_grp *tg = NULL; - int ret; - - tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); - if (!tg) - return NULL; - - ret = blkio_alloc_blkg_stats(&tg->blkg); - - if (ret) { - kfree(tg); - return NULL; - } - - throtl_init_group(tg); - return tg; + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; } static struct -throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) +throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) { - struct throtl_grp *tg = NULL; - void *key = td; - /* * This is the common case when there are no blkio cgroups. - * Avoid lookup in this case - */ + * Avoid lookup in this case + */ if (blkcg == &blkio_root_cgroup) - tg = td->root_tg; - else - tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + return td->root_tg; - __throtl_tg_fill_dev_details(td, tg); - return tg; + return blkg_to_tg(blkg_lookup(blkcg, td->queue)); } -static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, + struct blkio_cgroup *blkcg) { - struct throtl_grp *tg = NULL, *__tg = NULL; - struct blkio_cgroup *blkcg; struct request_queue *q = td->queue; - - /* no throttling for dead queue */ - if (unlikely(blk_queue_dead(q))) - return NULL; - - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); - tg = throtl_find_tg(td, blkcg); - if (tg) { - rcu_read_unlock(); - return tg; - } - - /* - * Need to allocate a group. Allocation of group also needs allocation - * of per cpu stats which in-turn takes a mutex() and can block. Hence - * we need to drop rcu lock and queue_lock before we call alloc. - */ - rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); - - tg = throtl_alloc_tg(td); - - /* Group allocated and queue is still alive. take the lock */ - spin_lock_irq(q->queue_lock); - - /* Make sure @q is still alive */ - if (unlikely(blk_queue_dead(q))) { - kfree(tg); - return NULL; - } - - /* - * Initialize the new group. After sleeping, read the blkcg again. - */ - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); + struct throtl_grp *tg = NULL; /* - * If some other thread already allocated the group while we were - * not holding queue lock, free up the group + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case */ - __tg = throtl_find_tg(td, blkcg); + if (blkcg == &blkio_root_cgroup) { + tg = td->root_tg; + } else { + struct blkio_group *blkg; - if (__tg) { - kfree(tg); - rcu_read_unlock(); - return __tg; - } + blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_THROTL, false); - /* Group allocation failed. Account the IO to root group */ - if (!tg) { - tg = td->root_tg; - return tg; + /* if %NULL and @q is alive, fall back to root_tg */ + if (!IS_ERR(blkg)) + tg = blkg_to_tg(blkg); + else if (!blk_queue_dead(q)) + tg = td->root_tg; } - throtl_init_add_tg_lists(td, tg, blkcg); - rcu_read_unlock(); return tg; } @@ -743,7 +571,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); + blkiocg_update_dispatch_stats(tg_to_blkg(tg), &blkio_policy_throtl, + bio->bi_size, rw, sync); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -753,7 +582,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, bio_list_add(&tg->bio_lists[rw], bio); /* Take a bio reference on tg */ - throtl_ref_get_tg(tg); + blkg_get(tg_to_blkg(tg)); tg->nr_queued[rw]++; td->nr_queued[rw]++; throtl_enqueue_tg(td, tg); @@ -786,8 +615,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, bio = bio_list_pop(&tg->bio_lists[rw]); tg->nr_queued[rw]--; - /* Drop bio reference on tg */ - throtl_put_tg(tg); + /* Drop bio reference on blkg */ + blkg_put(tg_to_blkg(tg)); BUG_ON(td->nr_queued[rw] <= 0); td->nr_queued[rw]--; @@ -865,8 +694,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) static void throtl_process_limit_change(struct throtl_data *td) { - struct throtl_grp *tg; - struct hlist_node *pos, *n; + struct request_queue *q = td->queue; + struct blkio_group *blkg, *n; if (!td->limits_changed) return; @@ -875,7 +704,9 @@ static void throtl_process_limit_change(struct throtl_data *td) throtl_log(td, "limits changed"); - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + struct throtl_grp *tg = blkg_to_tg(blkg); + if (!tg->limits_changed) continue; @@ -973,62 +804,6 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) } } -static void -throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) -{ - /* Something wrong if we are trying to remove same group twice */ - BUG_ON(hlist_unhashed(&tg->tg_node)); - - hlist_del_init(&tg->tg_node); - - /* - * Put the reference taken at the time of creation so that when all - * queues are gone, group can be destroyed. - */ - throtl_put_tg(tg); - td->nr_undestroyed_grps--; -} - -static void throtl_release_tgs(struct throtl_data *td) -{ - struct hlist_node *pos, *n; - struct throtl_grp *tg; - - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - /* - * If cgroup removal path got to blk_group first and removed - * it from cgroup list, then it will take care of destroying - * cfqg also. - */ - if (!blkiocg_del_blkio_group(&tg->blkg)) - throtl_destroy_tg(td, tg); - } -} - -/* - * Blk cgroup controller notification saying that blkio_group object is being - * delinked as associated cgroup object is going away. That also means that - * no new IO will come in this group. So get rid of this group as soon as - * any pending IO in the group is finished. - * - * This function is called under rcu_read_lock(). key is the rcu protected - * pointer. That means "key" is a valid throtl_data pointer as long as we are - * rcu read lock. - * - * "key" was fetched from blkio_group under blkio_cgroup->lock. That means - * it should not be NULL as even if queue was going away, cgroup deltion - * path got to it first. - */ -void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) -{ - unsigned long flags; - struct throtl_data *td = key; - - spin_lock_irqsave(td->queue->queue_lock, flags); - throtl_destroy_tg(td, tg_of_blkg(blkg)); - spin_unlock_irqrestore(td->queue->queue_lock, flags); -} - static void throtl_update_blkio_group_common(struct throtl_data *td, struct throtl_grp *tg) { @@ -1039,52 +814,48 @@ static void throtl_update_blkio_group_common(struct throtl_data *td, } /* - * For all update functions, key should be a valid pointer because these + * For all update functions, @q should be a valid pointer because these * update functions are called under blkcg_lock, that means, blkg is - * valid and in turn key is valid. queue exit path can not race because + * valid and in turn @q is valid. queue exit path can not race because * of blkcg_lock * * Can not take queue lock in update functions as queue lock under blkcg_lock * is not allowed. Under other paths we take blkcg_lock under queue_lock. */ -static void throtl_update_blkio_group_read_bps(void *key, +static void throtl_update_blkio_group_read_bps(struct request_queue *q, struct blkio_group *blkg, u64 read_bps) { - struct throtl_data *td = key; - struct throtl_grp *tg = tg_of_blkg(blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); tg->bps[READ] = read_bps; - throtl_update_blkio_group_common(td, tg); + throtl_update_blkio_group_common(q->td, tg); } -static void throtl_update_blkio_group_write_bps(void *key, +static void throtl_update_blkio_group_write_bps(struct request_queue *q, struct blkio_group *blkg, u64 write_bps) { - struct throtl_data *td = key; - struct throtl_grp *tg = tg_of_blkg(blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); tg->bps[WRITE] = write_bps; - throtl_update_blkio_group_common(td, tg); + throtl_update_blkio_group_common(q->td, tg); } -static void throtl_update_blkio_group_read_iops(void *key, +static void throtl_update_blkio_group_read_iops(struct request_queue *q, struct blkio_group *blkg, unsigned int read_iops) { - struct throtl_data *td = key; - struct throtl_grp *tg = tg_of_blkg(blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); tg->iops[READ] = read_iops; - throtl_update_blkio_group_common(td, tg); + throtl_update_blkio_group_common(q->td, tg); } -static void throtl_update_blkio_group_write_iops(void *key, +static void throtl_update_blkio_group_write_iops(struct request_queue *q, struct blkio_group *blkg, unsigned int write_iops) { - struct throtl_data *td = key; - struct throtl_grp *tg = tg_of_blkg(blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); tg->iops[WRITE] = write_iops; - throtl_update_blkio_group_common(td, tg); + throtl_update_blkio_group_common(q->td, tg); } static void throtl_shutdown_wq(struct request_queue *q) @@ -1096,7 +867,7 @@ static void throtl_shutdown_wq(struct request_queue *q) static struct blkio_policy_type blkio_policy_throtl = { .ops = { - .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_init_group_fn = throtl_init_blkio_group, .blkio_update_group_read_bps_fn = throtl_update_blkio_group_read_bps, .blkio_update_group_write_bps_fn = @@ -1107,6 +878,7 @@ static struct blkio_policy_type blkio_policy_throtl = { throtl_update_blkio_group_write_iops, }, .plid = BLKIO_POLICY_THROTL, + .pdata_size = sizeof(struct throtl_grp), }; bool blk_throtl_bio(struct request_queue *q, struct bio *bio) @@ -1122,33 +894,33 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) goto out; } + /* bio_associate_current() needs ioc, try creating */ + create_io_context(GFP_ATOMIC, q->node); + /* * A throtl_grp pointer retrieved under rcu can be used to access * basic fields like stats and io rates. If a group has no rules, * just update the dispatch stats in lockless manner and return. */ - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); - tg = throtl_find_tg(td, blkcg); + blkcg = bio_blkio_cgroup(bio); + tg = throtl_lookup_tg(td, blkcg); if (tg) { - throtl_tg_fill_dev_details(td, tg); - if (tg_no_rule_group(tg, rw)) { - blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, - rw, rw_is_sync(bio->bi_rw)); - rcu_read_unlock(); - goto out; + blkiocg_update_dispatch_stats(tg_to_blkg(tg), + &blkio_policy_throtl, + bio->bi_size, rw, + rw_is_sync(bio->bi_rw)); + goto out_unlock_rcu; } } - rcu_read_unlock(); /* * Either group has not been allocated yet or it is not an unlimited * IO group */ spin_lock_irq(q->queue_lock); - tg = throtl_get_tg(td); + tg = throtl_lookup_create_tg(td, blkcg); if (unlikely(!tg)) goto out_unlock; @@ -1189,6 +961,7 @@ queue_bio: tg->io_disp[rw], tg->iops[rw], tg->nr_queued[READ], tg->nr_queued[WRITE]); + bio_associate_current(bio); throtl_add_bio_tg(q->td, tg, bio); throttled = true; @@ -1199,6 +972,8 @@ queue_bio: out_unlock: spin_unlock_irq(q->queue_lock); +out_unlock_rcu: + rcu_read_unlock(); out: return throttled; } @@ -1241,79 +1016,42 @@ void blk_throtl_drain(struct request_queue *q) int blk_throtl_init(struct request_queue *q) { struct throtl_data *td; - struct throtl_grp *tg; + struct blkio_group *blkg; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; - INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; td->limits_changed = false; INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); - /* alloc and Init root group. */ + q->td = td; td->queue = q; - tg = throtl_alloc_tg(td); - if (!tg) { - kfree(td); - return -ENOMEM; - } + /* alloc and init root group. */ + rcu_read_lock(); + spin_lock_irq(q->queue_lock); - td->root_tg = tg; + blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_THROTL, + true); + if (!IS_ERR(blkg)) + td->root_tg = blkg_to_tg(blkg); - rcu_read_lock(); - throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); + spin_unlock_irq(q->queue_lock); rcu_read_unlock(); - /* Attach throtl data to request queue */ - q->td = td; + if (!td->root_tg) { + kfree(td); + return -ENOMEM; + } return 0; } void blk_throtl_exit(struct request_queue *q) { - struct throtl_data *td = q->td; - bool wait = false; - - BUG_ON(!td); - + BUG_ON(!q->td); throtl_shutdown_wq(q); - - spin_lock_irq(q->queue_lock); - throtl_release_tgs(td); - - /* If there are other groups */ - if (td->nr_undestroyed_grps > 0) - wait = true; - - spin_unlock_irq(q->queue_lock); - - /* - * Wait for tg->blkg->key accessors to exit their grace periods. - * Do this wait only if there are other undestroyed groups out - * there (other than root group). This can happen if cgroup deletion - * path claimed the responsibility of cleaning up a group before - * queue cleanup code get to the group. - * - * Do not call synchronize_rcu() unconditionally as there are drivers - * which create/delete request queue hundreds of times during scan/boot - * and synchronize_rcu() can take significant time and slow down boot. - */ - if (wait) - synchronize_rcu(); - - /* - * Just being safe to make sure after previous flush if some body did - * update limits through cgroup and another work got queued, cancel - * it. - */ - throtl_shutdown_wq(q); -} - -void blk_throtl_release(struct request_queue *q) -{ kfree(q->td); } diff --git a/block/blk.h b/block/blk.h index d45be871329e..85f6ae42f7d3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -23,7 +23,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); -void blk_drain_queue(struct request_queue *q, bool drain_all); +void blk_queue_bypass_start(struct request_queue *q); +void blk_queue_bypass_end(struct request_queue *q); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, @@ -144,9 +145,6 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); -void elv_quiesce_start(struct request_queue *q); -void elv_quiesce_end(struct request_queue *q); - /* * Return the threshold (number of used requests) at which the queue is @@ -186,32 +184,30 @@ static inline int blk_do_io_stat(struct request *rq) */ void get_io_context(struct io_context *ioc); struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); -struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask); +struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, + gfp_t gfp_mask); void ioc_clear_queue(struct request_queue *q); -void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask, - int node); +int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); /** * create_io_context - try to create task->io_context - * @task: target task * @gfp_mask: allocation mask * @node: allocation node * - * If @task->io_context is %NULL, allocate a new io_context and install it. - * Returns the current @task->io_context which may be %NULL if allocation - * failed. + * If %current->io_context is %NULL, allocate a new io_context and install + * it. Returns the current %current->io_context which may be %NULL if + * allocation failed. * * Note that this function can't be called with IRQ disabled because - * task_lock which protects @task->io_context is IRQ-unsafe. + * task_lock which protects %current->io_context is IRQ-unsafe. */ -static inline struct io_context *create_io_context(struct task_struct *task, - gfp_t gfp_mask, int node) +static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) { WARN_ON_ONCE(irqs_disabled()); - if (unlikely(!task->io_context)) - create_io_context_slowpath(task, gfp_mask, node); - return task->io_context; + if (unlikely(!current->io_context)) + create_task_io_context(current, gfp_mask, node); + return current->io_context; } /* @@ -222,7 +218,6 @@ extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); -extern void blk_throtl_release(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { @@ -231,7 +226,6 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } -static inline void blk_throtl_release(struct request_queue *q) { } #endif /* CONFIG_BLK_DEV_THROTTLING */ #endif /* BLK_INTERNAL_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 457295253566..39c43307dc6c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -17,6 +17,8 @@ #include "blk.h" #include "cfq.h" +static struct blkio_policy_type blkio_policy_cfq; + /* * tunables */ @@ -206,11 +208,7 @@ struct cfq_group { unsigned long saved_workload_slice; enum wl_type_t saved_workload; enum wl_prio_t saved_serving_prio; - struct blkio_group blkg; -#ifdef CONFIG_CFQ_GROUP_IOSCHED - struct hlist_node cfqd_node; - int ref; -#endif + /* number of requests that are on the dispatch list or inside driver */ int dispatched; struct cfq_ttime ttime; @@ -220,6 +218,10 @@ struct cfq_io_cq { struct io_cq icq; /* must be the first member */ struct cfq_queue *cfqq[2]; struct cfq_ttime ttime; + int ioprio; /* the current ioprio */ +#ifdef CONFIG_CFQ_GROUP_IOSCHED + uint64_t blkcg_id; /* the current blkcg ID */ +#endif }; /* @@ -229,7 +231,7 @@ struct cfq_data { struct request_queue *queue; /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; - struct cfq_group root_group; + struct cfq_group *root_group; /* * The priority currently being served @@ -302,12 +304,6 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_delayed_sync; - - /* List of cfq groups being managed on this device*/ - struct hlist_head cfqg_list; - - /* Number of groups which are on blkcg->blkg_list */ - unsigned int nr_blkcg_linked_grps; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); @@ -371,20 +367,48 @@ CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS #ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg) +{ + return blkg_to_pdata(blkg, &blkio_policy_cfq); +} + +static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg) +{ + return pdata_to_blkg(cfqg, &blkio_policy_cfq); +} + +static inline void cfqg_get(struct cfq_group *cfqg) +{ + return blkg_get(cfqg_to_blkg(cfqg)); +} + +static inline void cfqg_put(struct cfq_group *cfqg) +{ + return blkg_put(cfqg_to_blkg(cfqg)); +} + #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - blkg_path(&(cfqq)->cfqg->blkg), ##args) + blkg_path(cfqg_to_blkg((cfqq)->cfqg)), ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ - blkg_path(&(cfqg)->blkg), ##args) \ + blkg_path(cfqg_to_blkg((cfqg))), ##args) \ + +#else /* CONFIG_CFQ_GROUP_IOSCHED */ + +static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg) { return NULL; } +static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg) { return NULL; } +static inline void cfqg_get(struct cfq_group *cfqg) { } +static inline void cfqg_put(struct cfq_group *cfqg) { } -#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) -#endif + +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -465,8 +489,9 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, } static void cfq_dispatch_insert(struct request_queue *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, - struct io_context *, gfp_t); +static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, + struct cfq_io_cq *cic, struct bio *bio, + gfp_t gfp_mask); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -935,7 +960,8 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); cfq_group_service_tree_del(st, cfqg); cfqg->saved_workload_slice = 0; - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); + cfq_blkiocg_update_dequeue_stats(cfqg_to_blkg(cfqg), + &blkio_policy_cfq, 1); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, @@ -1007,178 +1033,70 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd), cfqq->nr_sectors); - cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, - unaccounted_sl); - cfq_blkiocg_set_start_empty_time(&cfqg->blkg); -} - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) -{ - if (blkg) - return container_of(blkg, struct cfq_group, blkg); - return NULL; -} - -static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, - unsigned int weight) -{ - struct cfq_group *cfqg = cfqg_of_blkg(blkg); - cfqg->new_weight = weight; - cfqg->needs_update = true; + cfq_blkiocg_update_timeslice_used(cfqg_to_blkg(cfqg), &blkio_policy_cfq, + used_sl, unaccounted_sl); + cfq_blkiocg_set_start_empty_time(cfqg_to_blkg(cfqg), &blkio_policy_cfq); } -static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, - struct cfq_group *cfqg, struct blkio_cgroup *blkcg) -{ - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; - unsigned int major, minor; - - /* - * Add group onto cgroup list. It might happen that bdi->dev is - * not initialized yet. Initialize this new group without major - * and minor info and this info will be filled in once a new thread - * comes for IO. - */ - if (bdi->dev) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, - (void *)cfqd, MKDEV(major, minor)); - } else - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, - (void *)cfqd, 0); - - cfqd->nr_blkcg_linked_grps++; - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); - - /* Add group on cfqd list */ - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); -} - -/* - * Should be called from sleepable context. No request queue lock as per - * cpu stats are allocated dynamically and alloc_percpu needs to be called - * from sleepable context. +/** + * cfq_init_cfqg_base - initialize base part of a cfq_group + * @cfqg: cfq_group to initialize + * + * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED + * is enabled or not. */ -static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) +static void cfq_init_cfqg_base(struct cfq_group *cfqg) { - struct cfq_group *cfqg = NULL; - int i, j, ret; struct cfq_rb_root *st; - - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); - if (!cfqg) - return NULL; + int i, j; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); cfqg->ttime.last_end_request = jiffies; - - /* - * Take the initial reference that will be released on destroy - * This can be thought of a joint reference by cgroup and - * elevator which will be dropped by either elevator exit - * or cgroup deletion path depending on who is exiting first. - */ - cfqg->ref = 1; - - ret = blkio_alloc_blkg_stats(&cfqg->blkg); - if (ret) { - kfree(cfqg); - return NULL; - } - - return cfqg; } -static struct cfq_group * -cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void cfq_update_blkio_group_weight(struct request_queue *q, + struct blkio_group *blkg, + unsigned int weight) { - struct cfq_group *cfqg = NULL; - void *key = cfqd; - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; - unsigned int major, minor; + struct cfq_group *cfqg = blkg_to_cfqg(blkg); - /* - * This is the common case when there are no blkio cgroups. - * Avoid lookup in this case - */ - if (blkcg == &blkio_root_cgroup) - cfqg = &cfqd->root_group; - else - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + cfqg->new_weight = weight; + cfqg->needs_update = true; +} - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfqg->blkg.dev = MKDEV(major, minor); - } +static void cfq_init_blkio_group(struct blkio_group *blkg) +{ + struct cfq_group *cfqg = blkg_to_cfqg(blkg); - return cfqg; + cfq_init_cfqg_base(cfqg); + cfqg->weight = blkg->blkcg->weight; } /* * Search for the cfq group current task belongs to. request_queue lock must * be held. */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) +static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, + struct blkio_cgroup *blkcg) { - struct blkio_cgroup *blkcg; - struct cfq_group *cfqg = NULL, *__cfqg = NULL; struct request_queue *q = cfqd->queue; + struct cfq_group *cfqg = NULL; - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); - cfqg = cfq_find_cfqg(cfqd, blkcg); - if (cfqg) { - rcu_read_unlock(); - return cfqg; - } - - /* - * Need to allocate a group. Allocation of group also needs allocation - * of per cpu stats which in-turn takes a mutex() and can block. Hence - * we need to drop rcu lock and queue_lock before we call alloc. - * - * Not taking any queue reference here and assuming that queue is - * around by the time we return. CFQ queue allocation code does - * the same. It might be racy though. - */ - - rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); - - cfqg = cfq_alloc_cfqg(cfqd); - - spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); - - /* - * If some other thread already allocated the group while we were - * not holding queue lock, free up the group - */ - __cfqg = cfq_find_cfqg(cfqd, blkcg); + /* avoid lookup for the common case where there's no blkio cgroup */ + if (blkcg == &blkio_root_cgroup) { + cfqg = cfqd->root_group; + } else { + struct blkio_group *blkg; - if (__cfqg) { - kfree(cfqg); - rcu_read_unlock(); - return __cfqg; + blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_PROP, false); + if (!IS_ERR(blkg)) + cfqg = blkg_to_cfqg(blkg); } - if (!cfqg) - cfqg = &cfqd->root_group; - - cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); - rcu_read_unlock(); - return cfqg; -} - -static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) -{ - cfqg->ref++; return cfqg; } @@ -1186,94 +1104,18 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { /* Currently, all async queues are mapped to root group */ if (!cfq_cfqq_sync(cfqq)) - cfqg = &cfqq->cfqd->root_group; + cfqg = cfqq->cfqd->root_group; cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ - cfqq->cfqg->ref++; -} - -static void cfq_put_cfqg(struct cfq_group *cfqg) -{ - struct cfq_rb_root *st; - int i, j; - - BUG_ON(cfqg->ref <= 0); - cfqg->ref--; - if (cfqg->ref) - return; - for_each_cfqg_st(cfqg, i, j, st) - BUG_ON(!RB_EMPTY_ROOT(&st->rb)); - free_percpu(cfqg->blkg.stats_cpu); - kfree(cfqg); -} - -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - /* Something wrong if we are trying to remove same group twice */ - BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); - - hlist_del_init(&cfqg->cfqd_node); - - BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); - cfqd->nr_blkcg_linked_grps--; - - /* - * Put the reference taken at the time of creation so that when all - * queues are gone, group can be destroyed. - */ - cfq_put_cfqg(cfqg); -} - -static void cfq_release_cfq_groups(struct cfq_data *cfqd) -{ - struct hlist_node *pos, *n; - struct cfq_group *cfqg; - - hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { - /* - * If cgroup removal path got to blk_group first and removed - * it from cgroup list, then it will take care of destroying - * cfqg also. - */ - if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) - cfq_destroy_cfqg(cfqd, cfqg); - } -} - -/* - * Blk cgroup controller notification saying that blkio_group object is being - * delinked as associated cgroup object is going away. That also means that - * no new IO will come in this group. So get rid of this group as soon as - * any pending IO in the group is finished. - * - * This function is called under rcu_read_lock(). key is the rcu protected - * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu - * read lock. - * - * "key" was fetched from blkio_group under blkio_cgroup->lock. That means - * it should not be NULL as even if elevator was exiting, cgroup deltion - * path got to it first. - */ -static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) -{ - unsigned long flags; - struct cfq_data *cfqd = key; - - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + cfqg_get(cfqg); } #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) +static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, + struct blkio_cgroup *blkcg) { - return &cfqd->root_group; -} - -static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) -{ - return cfqg; + return cfqd->root_group; } static inline void @@ -1281,9 +1123,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; } -static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} -static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} - #endif /* GROUP_IOSCHED */ /* @@ -1550,12 +1389,14 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(rq), rq_is_sync(rq)); + cfq_blkiocg_update_io_remove_stats(cfqg_to_blkg(RQ_CFQG(rq)), + &blkio_policy_cfq, rq_data_dir(rq), + rq_is_sync(rq)); cfq_add_rq_rb(rq); - cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, - &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), - rq_is_sync(rq)); + cfq_blkiocg_update_io_add_stats(cfqg_to_blkg(RQ_CFQG(rq)), + &blkio_policy_cfq, + cfqg_to_blkg(cfqq->cfqd->serving_group), + rq_data_dir(rq), rq_is_sync(rq)); } static struct request * @@ -1611,8 +1452,9 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(rq), rq_is_sync(rq)); + cfq_blkiocg_update_io_remove_stats(cfqg_to_blkg(RQ_CFQG(rq)), + &blkio_policy_cfq, rq_data_dir(rq), + rq_is_sync(rq)); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; @@ -1647,8 +1489,9 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, - bio_data_dir(bio), cfq_bio_sync(bio)); + cfq_blkiocg_update_io_merged_stats(cfqg_to_blkg(RQ_CFQG(req)), + &blkio_policy_cfq, bio_data_dir(bio), + cfq_bio_sync(bio)); } static void @@ -1670,8 +1513,9 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(next), rq_is_sync(next)); + cfq_blkiocg_update_io_merged_stats(cfqg_to_blkg(RQ_CFQG(rq)), + &blkio_policy_cfq, rq_data_dir(next), + rq_is_sync(next)); cfqq = RQ_CFQQ(next); /* @@ -1712,7 +1556,8 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) { del_timer(&cfqd->idle_slice_timer); - cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); + cfq_blkiocg_update_idle_time_stats(cfqg_to_blkg(cfqq->cfqg), + &blkio_policy_cfq); } static void __cfq_set_active_queue(struct cfq_data *cfqd, @@ -1721,7 +1566,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", cfqd->serving_prio, cfqd->serving_type); - cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); + cfq_blkiocg_update_avg_queue_size_stats(cfqg_to_blkg(cfqq->cfqg), + &blkio_policy_cfq); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -2042,7 +1888,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * task has exited, don't wait */ cic = cfqd->active_cic; - if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks)) + if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) return; /* @@ -2069,7 +1915,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); + cfq_blkiocg_update_set_idle_time_stats(cfqg_to_blkg(cfqq->cfqg), + &blkio_policy_cfq); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, group_idle ? 1 : 0); } @@ -2092,8 +1939,9 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), - rq_data_dir(rq), rq_is_sync(rq)); + cfq_blkiocg_update_dispatch_stats(cfqg_to_blkg(cfqq->cfqg), + &blkio_policy_cfq, blk_rq_bytes(rq), + rq_data_dir(rq), rq_is_sync(rq)); } /* @@ -2675,7 +2523,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); - cfq_put_cfqg(cfqg); + cfqg_put(cfqg); } static void cfq_put_cooperator(struct cfq_queue *cfqq) @@ -2734,7 +2582,7 @@ static void cfq_exit_icq(struct io_cq *icq) } } -static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) +static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) { struct task_struct *tsk = current; int ioprio_class; @@ -2742,7 +2590,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) if (!cfq_cfqq_prio_changed(cfqq)) return; - ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); + ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); @@ -2754,11 +2602,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) cfqq->ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(ioc); + cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); cfqq->ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - cfqq->ioprio = task_ioprio(ioc); + cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: @@ -2776,19 +2624,24 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) cfq_clear_cfqq_prio_changed(cfqq); } -static void changed_ioprio(struct cfq_io_cq *cic) +static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) { + int ioprio = cic->icq.ioc->ioprio; struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; - if (unlikely(!cfqd)) + /* + * Check whether ioprio has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; cfqq = cic->cfqq[BLK_RW_ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc, - GFP_ATOMIC); + new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, + GFP_ATOMIC); if (new_cfqq) { cic->cfqq[BLK_RW_ASYNC] = new_cfqq; cfq_put_queue(cfqq); @@ -2798,6 +2651,8 @@ static void changed_ioprio(struct cfq_io_cq *cic) cfqq = cic->cfqq[BLK_RW_SYNC]; if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); + + cic->ioprio = ioprio; } static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -2821,17 +2676,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void changed_cgroup(struct cfq_io_cq *cic) +static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { - struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); struct cfq_data *cfqd = cic_to_cfqd(cic); - struct request_queue *q; + struct cfq_queue *sync_cfqq; + uint64_t id; - if (unlikely(!cfqd)) - return; + rcu_read_lock(); + id = bio_blkio_cgroup(bio)->id; + rcu_read_unlock(); - q = cfqd->queue; + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!cfqd) || likely(cic->blkcg_id == id)) + return; + sync_cfqq = cic_to_cfqq(cic, 1); if (sync_cfqq) { /* * Drop reference to sync queue. A new sync queue will be @@ -2841,21 +2703,26 @@ static void changed_cgroup(struct cfq_io_cq *cic) cic_set_cfqq(cic, NULL, 1); cfq_put_queue(sync_cfqq); } + + cic->blkcg_id = id; } +#else +static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, - struct io_context *ioc, gfp_t gfp_mask) +cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, + struct bio *bio, gfp_t gfp_mask) { + struct blkio_cgroup *blkcg; struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_io_cq *cic; struct cfq_group *cfqg; retry: - cfqg = cfq_get_cfqg(cfqd); - cic = cfq_cic_lookup(cfqd, ioc); - /* cic always exists here */ + rcu_read_lock(); + + blkcg = bio_blkio_cgroup(bio); + cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); cfqq = cic_to_cfqq(cic, is_sync); /* @@ -2868,6 +2735,7 @@ retry: cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { + rcu_read_unlock(); spin_unlock_irq(cfqd->queue->queue_lock); new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask | __GFP_ZERO, @@ -2883,7 +2751,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, ioc); + cfq_init_prio_data(cfqq, cic); cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else @@ -2893,6 +2761,7 @@ retry: if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); + rcu_read_unlock(); return cfqq; } @@ -2902,6 +2771,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) switch (ioprio_class) { case IOPRIO_CLASS_RT: return &cfqd->async_cfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + /* fall through */ case IOPRIO_CLASS_BE: return &cfqd->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: @@ -2912,11 +2784,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, - gfp_t gfp_mask) +cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, + struct bio *bio, gfp_t gfp_mask) { - const int ioprio = task_ioprio(ioc); - const int ioprio_class = task_ioprio_class(ioc); + const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); + const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; @@ -2926,7 +2798,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, } if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -3008,7 +2880,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) enable_idle = 0; - else if (!atomic_read(&cic->icq.ioc->nr_tasks) || + else if (!atomic_read(&cic->icq.ioc->active_ref) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; @@ -3173,7 +3045,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, __blk_run_queue(cfqd->queue); } else { cfq_blkiocg_update_idle_time_stats( - &cfqq->cfqg->blkg); + cfqg_to_blkg(cfqq->cfqg), + &blkio_policy_cfq); cfq_mark_cfqq_must_dispatch(cfqq); } } @@ -3195,14 +3068,15 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) struct cfq_queue *cfqq = RQ_CFQQ(rq); cfq_log_cfqq(cfqd, cfqq, "insert_request"); - cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc); + cfq_init_prio_data(cfqq, RQ_CIC(rq)); rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, - &cfqd->serving_group->blkg, rq_data_dir(rq), - rq_is_sync(rq)); + cfq_blkiocg_update_io_add_stats(cfqg_to_blkg(RQ_CFQG(rq)), + &blkio_policy_cfq, + cfqg_to_blkg(cfqd->serving_group), + rq_data_dir(rq), rq_is_sync(rq)); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -3298,9 +3172,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->rq_in_driver--; cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; - cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, - rq_start_time_ns(rq), rq_io_start_time_ns(rq), - rq_data_dir(rq), rq_is_sync(rq)); + cfq_blkiocg_update_completion_stats(cfqg_to_blkg(cfqq->cfqg), + &blkio_policy_cfq, rq_start_time_ns(rq), + rq_io_start_time_ns(rq), rq_data_dir(rq), + rq_is_sync(rq)); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -3397,7 +3272,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); if (cfqq) { - cfq_init_prio_data(cfqq, cic->icq.ioc); + cfq_init_prio_data(cfqq, cic); return __cfq_may_queue(cfqq); } @@ -3419,7 +3294,7 @@ static void cfq_put_request(struct request *rq) cfqq->allocated[rw]--; /* Put down rq reference on cfqg */ - cfq_put_cfqg(RQ_CFQG(rq)); + cfqg_put(RQ_CFQG(rq)); rq->elv.priv[0] = NULL; rq->elv.priv[1] = NULL; @@ -3463,32 +3338,25 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) * Allocate cfq data structures associated with this request. */ static int -cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, + gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - unsigned int changed; might_sleep_if(gfp_mask & __GFP_WAIT); spin_lock_irq(q->queue_lock); - /* handle changed notifications */ - changed = icq_get_changed(&cic->icq); - if (unlikely(changed & ICQ_IOPRIO_CHANGED)) - changed_ioprio(cic); -#ifdef CONFIG_CFQ_GROUP_IOSCHED - if (unlikely(changed & ICQ_CGROUP_CHANGED)) - changed_cgroup(cic); -#endif - + check_ioprio_changed(cic, bio); + check_blkcg_changed(cic, bio); new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -3514,8 +3382,9 @@ new_queue: cfqq->allocated[rw]++; cfqq->ref++; + cfqg_get(cfqq->cfqg); rq->elv.priv[0] = cfqq; - rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg); + rq->elv.priv[1] = cfqq->cfqg; spin_unlock_irq(q->queue_lock); return 0; } @@ -3612,7 +3481,6 @@ static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; - bool wait = false; cfq_shutdown_timer_wq(cfqd); @@ -3622,89 +3490,59 @@ static void cfq_exit_queue(struct elevator_queue *e) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); cfq_put_async_queues(cfqd); - cfq_release_cfq_groups(cfqd); - - /* - * If there are groups which we could not unlink from blkcg list, - * wait for a rcu period for them to be freed. - */ - if (cfqd->nr_blkcg_linked_grps) - wait = true; spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); - /* - * Wait for cfqg->blkg->key accessors to exit their grace periods. - * Do this wait only if there are other unlinked groups out - * there. This can happen if cgroup deletion path claimed the - * responsibility of cleaning up a group before queue cleanup code - * get to the group. - * - * Do not call synchronize_rcu() unconditionally as there are drivers - * which create/delete request queue hundreds of times during scan/boot - * and synchronize_rcu() can take significant time and slow down boot. - */ - if (wait) - synchronize_rcu(); - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - /* Free up per cpu stats for root group */ - free_percpu(cfqd->root_group.blkg.stats_cpu); +#ifndef CONFIG_CFQ_GROUP_IOSCHED + kfree(cfqd->root_group); #endif + update_root_blkg_pd(q, BLKIO_POLICY_PROP); kfree(cfqd); } -static void *cfq_init_queue(struct request_queue *q) +static int cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; - int i, j; - struct cfq_group *cfqg; - struct cfq_rb_root *st; + struct blkio_group *blkg __maybe_unused; + int i; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) - return NULL; + return -ENOMEM; + + cfqd->queue = q; + q->elevator->elevator_data = cfqd; /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; - /* Init root group */ - cfqg = &cfqd->root_group; - for_each_cfqg_st(cfqg, i, j, st) - *st = CFQ_RB_ROOT; - RB_CLEAR_NODE(&cfqg->rb_node); - - /* Give preference to root group over other groups */ - cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; - + /* Init root group and prefer root group over other groups by default */ #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* - * Set root group reference to 2. One reference will be dropped when - * all groups on cfqd->cfqg_list are being deleted during queue exit. - * Other reference will remain there as we don't want to delete this - * group as it is statically allocated and gets destroyed when - * throtl_data goes away. - */ - cfqg->ref = 2; + rcu_read_lock(); + spin_lock_irq(q->queue_lock); - if (blkio_alloc_blkg_stats(&cfqg->blkg)) { - kfree(cfqg); + blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_PROP, + true); + if (!IS_ERR(blkg)) + cfqd->root_group = blkg_to_cfqg(blkg); + + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); +#else + cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), + GFP_KERNEL, cfqd->queue->node); + if (cfqd->root_group) + cfq_init_cfqg_base(cfqd->root_group); +#endif + if (!cfqd->root_group) { kfree(cfqd); - return NULL; + return -ENOMEM; } - rcu_read_lock(); - - cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, - (void *)cfqd, 0); - rcu_read_unlock(); - cfqd->nr_blkcg_linked_grps++; + cfqd->root_group->weight = 2*BLKIO_WEIGHT_DEFAULT; - /* Add group on cfqd->cfqg_list */ - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); -#endif /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -3716,13 +3554,17 @@ static void *cfq_init_queue(struct request_queue *q) /* * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. + * will not attempt to free it. oom_cfqq is linked to root_group + * but shouldn't hold a reference as it'll never be unlinked. Lose + * the reference from linking right away. */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); cfqd->oom_cfqq.ref++; - cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); - cfqd->queue = q; + spin_lock_irq(q->queue_lock); + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); + cfqg_put(cfqd->root_group); + spin_unlock_irq(q->queue_lock); init_timer(&cfqd->idle_slice_timer); cfqd->idle_slice_timer.function = cfq_idle_slice_timer; @@ -3747,7 +3589,7 @@ static void *cfq_init_queue(struct request_queue *q) * second, in order to have larger depth for async operations. */ cfqd->last_delayed_sync = jiffies - HZ; - return cfqd; + return 0; } /* @@ -3873,13 +3715,12 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkio_policy_type blkio_policy_cfq = { .ops = { - .blkio_unlink_group_fn = cfq_unlink_blkio_group, + .blkio_init_group_fn = cfq_init_blkio_group, .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, }, .plid = BLKIO_POLICY_PROP, + .pdata_size = sizeof(struct cfq_group), }; -#else -static struct blkio_policy_type blkio_policy_cfq; #endif static int __init cfq_init(void) @@ -3910,14 +3751,17 @@ static int __init cfq_init(void) return ret; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED blkio_policy_register(&blkio_policy_cfq); - +#endif return 0; } static void __exit cfq_exit(void) { +#ifdef CONFIG_CFQ_GROUP_IOSCHED blkio_policy_unregister(&blkio_policy_cfq); +#endif elv_unregister(&iosched_cfq); kmem_cache_destroy(cfq_pool); } diff --git a/block/cfq.h b/block/cfq.h index 2a155927e37c..c8b15ef57e5d 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -4,112 +4,115 @@ #ifdef CONFIG_CFQ_GROUP_IOSCHED static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg, + bool direction, bool sync) { - blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); + blkiocg_update_io_add_stats(blkg, pol, curr_blkg, direction, sync); } static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) + struct blkio_policy_type *pol, unsigned long dequeue) { - blkiocg_update_dequeue_stats(blkg, dequeue); + blkiocg_update_dequeue_stats(blkg, pol, dequeue); } static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time, unsigned long unaccounted_time) + struct blkio_policy_type *pol, unsigned long time, + unsigned long unaccounted_time) { - blkiocg_update_timeslice_used(blkg, time, unaccounted_time); + blkiocg_update_timeslice_used(blkg, pol, time, unaccounted_time); } -static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - blkiocg_set_start_empty_time(blkg); + blkiocg_set_start_empty_time(blkg, pol); } static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) + struct blkio_policy_type *pol, bool direction, + bool sync) { - blkiocg_update_io_remove_stats(blkg, direction, sync); + blkiocg_update_io_remove_stats(blkg, pol, direction, sync); } static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) + struct blkio_policy_type *pol, bool direction, + bool sync) { - blkiocg_update_io_merged_stats(blkg, direction, sync); + blkiocg_update_io_merged_stats(blkg, pol, direction, sync); } -static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - blkiocg_update_idle_time_stats(blkg); + blkiocg_update_idle_time_stats(blkg, pol); } static inline void -cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - blkiocg_update_avg_queue_size_stats(blkg); + blkiocg_update_avg_queue_size_stats(blkg, pol); } static inline void -cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - blkiocg_update_set_idle_time_stats(blkg); + blkiocg_update_set_idle_time_stats(blkg, pol); } static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) + struct blkio_policy_type *pol, uint64_t bytes, + bool direction, bool sync) { - blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); + blkiocg_update_dispatch_stats(blkg, pol, bytes, direction, sync); } -static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol, uint64_t start_time, + uint64_t io_start_time, bool direction, bool sync) { - blkiocg_update_completion_stats(blkg, start_time, io_start_time, - direction, sync); -} - -static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) { - blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); -} - -static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - return blkiocg_del_blkio_group(blkg); + blkiocg_update_completion_stats(blkg, pol, start_time, io_start_time, + direction, sync); } #else /* CFQ_GROUP_IOSCHED */ static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) {} - + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg, bool direction, + bool sync) { } static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) {} - + struct blkio_policy_type *pol, unsigned long dequeue) { } static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time, unsigned long unaccounted_time) {} -static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} + struct blkio_policy_type *pol, unsigned long time, + unsigned long unaccounted_time) { } +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg, + struct blkio_policy_type *pol) { } static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) {} + struct blkio_policy_type *pol, bool direction, + bool sync) { } static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) -{ -} + struct blkio_policy_type *pol, bool direction, + bool sync) { } +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { } static inline void -cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { } static inline void -cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { } static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) {} -static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} - -static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) {} -static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - return 0; -} + struct blkio_policy_type *pol, uint64_t bytes, + bool direction, bool sync) { } +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol, uint64_t start_time, + uint64_t io_start_time, bool direction, bool sync) { } #endif /* CFQ_GROUP_IOSCHED */ #endif diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 7bf12d793fcd..599b12e5380f 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). */ -static void *deadline_init_queue(struct request_queue *q) +static int deadline_init_queue(struct request_queue *q) { struct deadline_data *dd; dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); if (!dd) - return NULL; + return -ENOMEM; INIT_LIST_HEAD(&dd->fifo_list[READ]); INIT_LIST_HEAD(&dd->fifo_list[WRITE]); @@ -354,7 +354,9 @@ static void *deadline_init_queue(struct request_queue *q) dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; - return dd; + + q->elevator->elevator_data = dd; + return 0; } /* diff --git a/block/elevator.c b/block/elevator.c index f016855a46b0..be3ab6df0fea 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -38,6 +38,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -121,15 +122,6 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static int elevator_init_queue(struct request_queue *q, - struct elevator_queue *eq) -{ - eq->elevator_data = eq->type->ops.elevator_init_fn(q); - if (eq->elevator_data) - return 0; - return -ENOMEM; -} - static char chosen_elevator[ELV_NAME_MAX]; static int __init elevator_setup(char *str) @@ -188,7 +180,6 @@ static void elevator_release(struct kobject *kobj) int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; - struct elevator_queue *eq; int err; if (unlikely(q->elevator)) @@ -222,17 +213,16 @@ int elevator_init(struct request_queue *q, char *name) } } - eq = elevator_alloc(q, e); - if (!eq) + q->elevator = elevator_alloc(q, e); + if (!q->elevator) return -ENOMEM; - err = elevator_init_queue(q, eq); + err = e->ops.elevator_init_fn(q); if (err) { - kobject_put(&eq->kobj); + kobject_put(&q->elevator->kobj); return err; } - q->elevator = eq; return 0; } EXPORT_SYMBOL(elevator_init); @@ -564,25 +554,6 @@ void elv_drain_elevator(struct request_queue *q) } } -void elv_quiesce_start(struct request_queue *q) -{ - if (!q->elevator) - return; - - spin_lock_irq(q->queue_lock); - queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); - - blk_drain_queue(q, false); -} - -void elv_quiesce_end(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); -} - void __elv_add_request(struct request_queue *q, struct request *rq, int where) { trace_block_rq_insert(q, rq); @@ -692,12 +663,13 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +int elv_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) { struct elevator_queue *e = q->elevator; if (e->type->ops.elevator_set_req_fn) - return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask); + return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; } @@ -801,8 +773,9 @@ static struct kobj_type elv_ktype = { .release = elevator_release, }; -int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) +int elv_register_queue(struct request_queue *q) { + struct elevator_queue *e = q->elevator; int error; error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); @@ -820,11 +793,6 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) } return error; } - -int elv_register_queue(struct request_queue *q) -{ - return __elv_register_queue(q, q->elevator); -} EXPORT_SYMBOL(elv_register_queue); void elv_unregister_queue(struct request_queue *q) @@ -907,53 +875,62 @@ EXPORT_SYMBOL_GPL(elv_unregister); */ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - struct elevator_queue *old_elevator, *e; + struct elevator_queue *old = q->elevator; + bool registered = old->registered; int err; - /* allocate new elevator */ - e = elevator_alloc(q, new_e); - if (!e) - return -ENOMEM; + /* + * Turn on BYPASS and drain all requests w/ elevator private data. + * Block layer doesn't call into a quiesced elevator - all requests + * are directly put on the dispatch list without elevator data + * using INSERT_BACK. All requests have SOFTBARRIER set and no + * merge happens either. + */ + blk_queue_bypass_start(q); + + /* unregister and clear all auxiliary data of the old elevator */ + if (registered) + elv_unregister_queue(q); + + spin_lock_irq(q->queue_lock); + ioc_clear_queue(q); + spin_unlock_irq(q->queue_lock); + + blkg_destroy_all(q, false); - err = elevator_init_queue(q, e); + /* allocate, init and register new elevator */ + err = -ENOMEM; + q->elevator = elevator_alloc(q, new_e); + if (!q->elevator) + goto fail_init; + + err = new_e->ops.elevator_init_fn(q); if (err) { - kobject_put(&e->kobj); - return err; + kobject_put(&q->elevator->kobj); + goto fail_init; } - /* turn on BYPASS and drain all requests w/ elevator private data */ - elv_quiesce_start(q); - - /* unregister old queue, register new one and kill old elevator */ - if (q->elevator->registered) { - elv_unregister_queue(q); - err = __elv_register_queue(q, e); + if (registered) { + err = elv_register_queue(q); if (err) goto fail_register; } - /* done, clear io_cq's, switch elevators and turn off BYPASS */ - spin_lock_irq(q->queue_lock); - ioc_clear_queue(q); - old_elevator = q->elevator; - q->elevator = e; - spin_unlock_irq(q->queue_lock); - - elevator_exit(old_elevator); - elv_quiesce_end(q); + /* done, kill the old one and finish */ + elevator_exit(old); + blk_queue_bypass_end(q); - blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name); + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); return 0; fail_register: - /* - * switch failed, exit the new io scheduler and reattach the old - * one again (along with re-adding the sysfs dir) - */ - elevator_exit(e); + elevator_exit(q->elevator); +fail_init: + /* switch failed, restore and re-register old elevator */ + q->elevator = old; elv_register_queue(q); - elv_quiesce_end(q); + blk_queue_bypass_end(q); return err; } diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 413a0b1d788c..5d1bf70e33d5 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -59,15 +59,17 @@ noop_latter_request(struct request_queue *q, struct request *rq) return list_entry(rq->queuelist.next, struct request, queuelist); } -static void *noop_init_queue(struct request_queue *q) +static int noop_init_queue(struct request_queue *q) { struct noop_data *nd; nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); if (!nd) - return NULL; + return -ENOMEM; + INIT_LIST_HEAD(&nd->queue); - return nd; + q->elevator->elevator_data = nd; + return 0; } static void noop_exit_queue(struct elevator_queue *e) @@ -19,12 +19,14 @@ #include <linux/swap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> +#include <linux/cgroup.h> #include <scsi/sg.h> /* for struct sg_iovec */ #include <trace/events/block.h> @@ -418,6 +420,7 @@ void bio_put(struct bio *bio) * last put frees it */ if (atomic_dec_and_test(&bio->bi_cnt)) { + bio_disassociate_task(bio); bio->bi_next = NULL; bio->bi_destructor(bio); } @@ -1641,6 +1644,64 @@ bad: } EXPORT_SYMBOL(bioset_create); +#ifdef CONFIG_BLK_CGROUP +/** + * bio_associate_current - associate a bio with %current + * @bio: target bio + * + * Associate @bio with %current if it hasn't been associated yet. Block + * layer will treat @bio as if it were issued by %current no matter which + * task actually issues it. + * + * This function takes an extra reference of @task's io_context and blkcg + * which will be put when @bio is released. The caller must own @bio, + * ensure %current->io_context exists, and is responsible for synchronizing + * calls to this function. + */ +int bio_associate_current(struct bio *bio) +{ + struct io_context *ioc; + struct cgroup_subsys_state *css; + + if (bio->bi_ioc) + return -EBUSY; + + ioc = current->io_context; + if (!ioc) + return -ENOENT; + + /* acquire active ref on @ioc and associate */ + get_io_context_active(ioc); + bio->bi_ioc = ioc; + + /* associate blkcg if exists */ + rcu_read_lock(); + css = task_subsys_state(current, blkio_subsys_id); + if (css && css_tryget(css)) + bio->bi_css = css; + rcu_read_unlock(); + + return 0; +} + +/** + * bio_disassociate_task - undo bio_associate_current() + * @bio: target bio + */ +void bio_disassociate_task(struct bio *bio) +{ + if (bio->bi_ioc) { + put_io_context(bio->bi_ioc); + bio->bi_ioc = NULL; + } + if (bio->bi_css) { + css_put(bio->bi_css); + bio->bi_css = NULL; + } +} + +#endif /* CONFIG_BLK_CGROUP */ + static void __init biovec_init_slabs(void) { int i; diff --git a/fs/ioprio.c b/fs/ioprio.c index 0f1b9515213b..48644373de58 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -50,7 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { - ioc_ioprio_changed(ioc, ioprio); + ioc->ioprio = ioprio; put_io_context(ioc); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 4d94eb8bcbcc..26435890dc87 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -269,6 +269,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); +#ifdef CONFIG_BLK_CGROUP +int bio_associate_current(struct bio *bio); +void bio_disassociate_task(struct bio *bio); +#else /* CONFIG_BLK_CGROUP */ +static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } +static inline void bio_disassociate_task(struct bio *bio) { } +#endif /* CONFIG_BLK_CGROUP */ + /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4053cbd4490e..0edb65dd8edd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -14,6 +14,8 @@ struct bio; struct bio_integrity_payload; struct page; struct block_device; +struct io_context; +struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *, int); typedef void (bio_destructor_t) (struct bio *); @@ -66,6 +68,14 @@ struct bio { bio_end_io_t *bi_end_io; void *bi_private; +#ifdef CONFIG_BLK_CGROUP + /* + * Optional ioc and css associated with this bio. Put on bio + * release. Read comment on top of bio_associate_current(). + */ + struct io_context *bi_ioc; + struct cgroup_subsys_state *bi_css; +#endif #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 606cf339bb56..33f1b29e53f4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -362,6 +362,10 @@ struct request_queue { struct list_head timeout_list; struct list_head icq_list; +#ifdef CONFIG_BLK_CGROUP + /* XXX: array size hardcoded to avoid include dependency (temporary) */ + struct list_head blkg_list; +#endif struct queue_limits limits; @@ -389,12 +393,17 @@ struct request_queue { struct mutex sysfs_lock; + int bypass_depth; + #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; int bsg_job_size; struct bsg_class_device bsg_dev; #endif +#ifdef CONFIG_BLK_CGROUP + struct list_head all_q_node; +#endif #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; @@ -406,7 +415,7 @@ struct request_queue { #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ -#define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ +#define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */ #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ @@ -494,6 +503,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) +#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 7d4e0356f329..c03af7687bb4 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -28,12 +28,13 @@ typedef int (elevator_may_queue_fn) (struct request_queue *, int); typedef void (elevator_init_icq_fn) (struct io_cq *); typedef void (elevator_exit_icq_fn) (struct io_cq *); -typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); +typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, + struct bio *, gfp_t); typedef void (elevator_put_req_fn) (struct request *); typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); -typedef void *(elevator_init_fn) (struct request_queue *); +typedef int (elevator_init_fn) (struct request_queue *); typedef void (elevator_exit_fn) (struct elevator_queue *); struct elevator_ops @@ -129,7 +130,8 @@ extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, int); extern void elv_abort_queue(struct request_queue *); extern void elv_completed_request(struct request_queue *, struct request *); -extern int elv_set_request(struct request_queue *, struct request *, gfp_t); +extern int elv_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask); extern void elv_put_request(struct request_queue *, struct request *); extern void elv_drain_elevator(struct request_queue *); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 1a3018063034..df38db2ef45b 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -6,11 +6,7 @@ #include <linux/workqueue.h> enum { - ICQ_IOPRIO_CHANGED = 1 << 0, - ICQ_CGROUP_CHANGED = 1 << 1, ICQ_EXITED = 1 << 2, - - ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED, }; /* @@ -100,6 +96,7 @@ struct io_cq { */ struct io_context { atomic_long_t refcount; + atomic_t active_ref; atomic_t nr_tasks; /* all the fields below are protected by this lock */ @@ -120,29 +117,37 @@ struct io_context { struct work_struct release_work; }; -static inline struct io_context *ioc_task_link(struct io_context *ioc) +/** + * get_io_context_active - get active reference on ioc + * @ioc: ioc of interest + * + * Only iocs with active reference can issue new IOs. This function + * acquires an active reference on @ioc. The caller must already have an + * active reference on @ioc. + */ +static inline void get_io_context_active(struct io_context *ioc) { - /* - * if ref count is zero, don't allow sharing (ioc is going away, it's - * a race). - */ - if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) { - atomic_inc(&ioc->nr_tasks); - return ioc; - } + WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0); + WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0); + atomic_long_inc(&ioc->refcount); + atomic_inc(&ioc->active_ref); +} + +static inline void ioc_task_link(struct io_context *ioc) +{ + get_io_context_active(ioc); - return NULL; + WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); + atomic_inc(&ioc->nr_tasks); } struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); +void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); -void ioc_ioprio_changed(struct io_context *ioc, int ioprio); -void ioc_cgroup_changed(struct io_context *ioc); -unsigned int icq_get_changed(struct io_cq *icq); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 76dad4808847..beb9ce1c2c23 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -42,26 +42,14 @@ enum { }; /* - * if process has set io priority explicitly, use that. if not, convert - * the cpu scheduler nice value to an io priority + * Fallback BE priority */ #define IOPRIO_NORM (4) -static inline int task_ioprio(struct io_context *ioc) -{ - if (ioprio_valid(ioc->ioprio)) - return IOPRIO_PRIO_DATA(ioc->ioprio); - - return IOPRIO_NORM; -} - -static inline int task_ioprio_class(struct io_context *ioc) -{ - if (ioprio_valid(ioc->ioprio)) - return IOPRIO_PRIO_CLASS(ioc->ioprio); - - return IOPRIO_CLASS_BE; -} +/* + * if process has set io priority explicitly, use that. if not, convert + * the cpu scheduler nice value to an io priority + */ static inline int task_nice_ioprio(struct task_struct *task) { return (task_nice(task) + 20) / 5; diff --git a/init/Kconfig b/init/Kconfig index 72f33faca44f..2acd917ebf2d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -757,7 +757,7 @@ config RT_GROUP_SCHED endif #CGROUP_SCHED config BLK_CGROUP - tristate "Block IO controller" + bool "Block IO controller" depends on BLOCK default n ---help--- diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff18..08eb8584e2a8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,9 +946,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - tsk->io_context = ioc_task_link(ioc); - if (unlikely(!tsk->io_context)) - return -ENOMEM; + ioc_task_link(ioc); + tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); if (unlikely(!new_ioc)) |