diff options
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r-- | block/blk-throttle.c | 313 |
1 files changed, 237 insertions, 76 deletions
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 252a81a306f7..a62be8d0dc1b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -78,6 +78,8 @@ struct throtl_grp { /* Some throttle limits got updated for the group */ int limits_changed; + + struct rcu_head rcu_head; }; struct throtl_data @@ -88,7 +90,7 @@ struct throtl_data /* service tree for active throtl groups */ struct throtl_rb_root tg_service_tree; - struct throtl_grp root_tg; + struct throtl_grp *root_tg; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ @@ -151,56 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) return tg; } -static void throtl_put_tg(struct throtl_grp *tg) +static void throtl_free_tg(struct rcu_head *head) { - BUG_ON(atomic_read(&tg->ref) <= 0); - if (!atomic_dec_and_test(&tg->ref)) - return; + struct throtl_grp *tg; + + tg = container_of(head, struct throtl_grp, rcu_head); + free_percpu(tg->blkg.stats_cpu); kfree(tg); } -static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, - struct blkio_cgroup *blkcg) +static void throtl_put_tg(struct throtl_grp *tg) { - struct throtl_grp *tg = NULL; - void *key = td; - struct backing_dev_info *bdi = &td->queue->backing_dev_info; - unsigned int major, minor; + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; /* - * TODO: Speed up blkiocg_lookup_group() by maintaining a radix - * tree of blkg (instead of traversing through hash list all - * the time. + * A group is freed in rcu manner. But having an rcu lock does not + * mean that one can access all the fields of blkg and assume these + * are valid. For example, don't try to follow throtl_data and + * request queue links. 
+ * + * Having a reference to blkg under an rcu allows access to only + * values local to groups like group stats and group rate limits */ + call_rcu(&tg->rcu_head, throtl_free_tg); +} - /* - * This is the common case when there are no blkio cgroups. - * Avoid lookup in this case - */ - if (blkcg == &blkio_root_cgroup) - tg = &td->root_tg; - else - tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); - - /* Fill in device details for root group */ - if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - tg->blkg.dev = MKDEV(major, minor); - goto done; - } - - if (tg) - goto done; - - tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); - if (!tg) - goto done; - +static void throtl_init_group(struct throtl_grp *tg) +{ INIT_HLIST_NODE(&tg->tg_node); RB_CLEAR_NODE(&tg->rb_node); bio_list_init(&tg->bio_lists[0]); bio_list_init(&tg->bio_lists[1]); - td->limits_changed = false; + tg->limits_changed = false; + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; /* * Take the initial reference that will be released on destroy @@ -209,33 +199,181 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, * exit or cgroup deletion path depending on who is exiting first. 
+ */ atomic_set(&tg->ref, 1); +} + +/* Should be called with rcu read lock held (needed for blkcg) */ +static void +throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) +{ + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +} + +static void +__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) +{ + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + if (!tg || tg->blkg.dev) + return; + + /* + * Fill in device details for a group which might not have been + * filled at group creation time as queue was being instantiated + * and driver had not attached a device yet + */ + if (bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + } +} + +/* + * Should be called without queue lock held. Here queue lock will be + * taken rarely. It will be taken only once during life time of a group + * if need be + */ +static void +throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!tg || tg->blkg.dev) + return; + + spin_lock_irq(td->queue->queue_lock); + __throtl_tg_fill_dev_details(td, tg); + spin_unlock_irq(td->queue->queue_lock); +} + +static void throtl_init_add_tg_lists(struct throtl_data *td, + struct throtl_grp *tg, struct blkio_cgroup *blkcg) +{ + __throtl_tg_fill_dev_details(td, tg); /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, - MKDEV(major, minor), BLKIO_POLICY_THROTL); + tg->blkg.dev, BLKIO_POLICY_THROTL); tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; -done: + throtl_add_group_to_td_list(td, tg); +} + 
+/* Should be called without queue lock and outside of rcu period */ +static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) +{ + struct throtl_grp *tg = NULL; + int ret; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + return NULL; + + ret = blkio_alloc_blkg_stats(&tg->blkg); + + if (ret) { + kfree(tg); + return NULL; + } + + throtl_init_group(tg); return tg; } -static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +static struct +throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) { struct throtl_grp *tg = NULL; + void *key = td; + + /* + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case + */ + if (blkcg == &blkio_root_cgroup) + tg = td->root_tg; + else + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + __throtl_tg_fill_dev_details(td, tg); + return tg; +} + +/* + * This function returns with queue lock unlocked in case of error, like + * request queue is no more + */ +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct throtl_grp *tg = NULL, *__tg = NULL; struct blkio_cgroup *blkcg; + struct request_queue *q = td->queue; rcu_read_lock(); blkcg = task_blkio_cgroup(current); - tg = throtl_find_alloc_tg(td, blkcg); - if (!tg) - tg = &td->root_tg; + tg = throtl_find_tg(td, blkcg); + if (tg) { + rcu_read_unlock(); + return tg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in-turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc + * + * Take the request queue reference to make sure queue does not + * go away once we return from allocation. + */ + blk_get_queue(q); + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + tg = throtl_alloc_tg(td); + /* + * We might have slept in group allocation. 
Make sure queue is not + * dead + */ + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + blk_put_queue(q); + if (tg) + kfree(tg); + + return ERR_PTR(-ENODEV); + } + blk_put_queue(q); + + /* Group allocated and queue is still alive. take the lock */ + spin_lock_irq(q->queue_lock); + + /* + * Initialize the new group. After sleeping, read the blkcg again. + */ + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __tg = throtl_find_tg(td, blkcg); + + if (__tg) { + kfree(tg); + rcu_read_unlock(); + return __tg; + } + + /* Group allocation failed. Account the IO to root group */ + if (!tg) { + tg = td->root_tg; + return tg; + } + + throtl_init_add_tg_lists(td, tg, blkcg); rcu_read_unlock(); return tg; } @@ -544,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, return 0; } +static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) + return 1; + return 0; +} + /* * Returns whether one can dispatch a bio or not. Also returns approx number * of jiffies to wait before this bio is with-in IO rate and can be dispatched @@ -608,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - /* - * TODO: This will take blkg->stats_lock. Figure out a way - * to avoid this cost. - */ blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); } @@ -989,15 +1129,51 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) struct throtl_grp *tg; struct bio *bio = *biop; bool rw = bio_data_dir(bio), update_disptime = true; + struct blkio_cgroup *blkcg; if (bio->bi_rw & REQ_THROTTLED) { bio->bi_rw &= ~REQ_THROTTLED; return 0; } + /* + * A throtl_grp pointer retrieved under rcu can be used to access + * basic fields like stats and io rates. 
If a group has no rules, + * just update the dispatch stats in lockless manner and return. + */ + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_tg(td, blkcg); + if (tg) { + throtl_tg_fill_dev_details(td, tg); + + if (tg_no_rule_group(tg, rw)) { + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, + rw, bio->bi_rw & REQ_SYNC); + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + + /* + * Either group has not been allocated yet or it is not an unlimited + * IO group + */ + spin_lock_irq(q->queue_lock); tg = throtl_get_tg(td); + if (IS_ERR(tg)) { + if (PTR_ERR(tg) == -ENODEV) { + /* + * Queue is gone. No queue lock held here. + */ + return -ENODEV; + } + } + if (tg->nr_queued[rw]) { /* * There is already another bio queued in same dir. No @@ -1060,39 +1236,24 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; td->limits_changed = false; + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); - /* Init root group */ - tg = &td->root_tg; - INIT_HLIST_NODE(&tg->tg_node); - RB_CLEAR_NODE(&tg->rb_node); - bio_list_init(&tg->bio_lists[0]); - bio_list_init(&tg->bio_lists[1]); - - /* Practically unlimited BW */ - tg->bps[0] = tg->bps[1] = -1; - tg->iops[0] = tg->iops[1] = -1; - td->limits_changed = false; + /* alloc and Init root group. */ + td->queue = q; + tg = throtl_alloc_tg(td); - /* - * Set root group reference to 2. One reference will be dropped when - * all groups on tg_list are being deleted during queue exit. Other - * reference will remain there as we don't want to delete this group - * as it is statically allocated and gets destroyed when throtl_data - * goes away. 
- */ - atomic_set(&tg->ref, 2); - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; + if (!tg) { + kfree(td); + return -ENOMEM; + } - INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + td->root_tg = tg; rcu_read_lock(); - blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, - 0, BLKIO_POLICY_THROTL); + throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); rcu_read_unlock(); /* Attach throtl data to request queue */ - td->queue = q; q->td = td; return 0; } |