From ea86ea2cdced20057da4d2c32965c1219c238197 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Nov 2018 13:18:06 -0700 Subject: sbitmap: ammortize cost of clearing bits sbitmap maintains a set of words that we use to set and clear bits, with each bit representing a tag for blk-mq. Even though we spread the bits out and maintain a hint cache, one particular bit allocated will end up being cleared in the exact same spot. This introduces batched clearing of bits. Instead of clearing a given bit, the same bit is set in a cleared/free mask instead. If we fail allocating a bit from a given word, then we check the free mask, and batch move those cleared bits at that time. This trades 64 atomic bitops for 2 cmpxchg(). In a threaded poll test case, half the overhead of getting and clearing tags is removed with this change. On another poll test case with a single thread, performance is unchanged. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux/sbitmap.h') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 804a50983ec5..81359d45751e 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -30,14 +30,24 @@ struct seq_file; */ struct sbitmap_word { /** - * @word: The bitmap word itself. + * @depth: Number of bits being used in @word/@cleared */ - unsigned long word; + unsigned long depth; /** - * @depth: Number of bits being used in @word. + * @word: word holding free bits */ - unsigned long depth; + unsigned long word ____cacheline_aligned_in_smp; + + /** + * @cleared: word holding cleared bits + */ + unsigned long cleared ____cacheline_aligned_in_smp; + + /** + * @swap_lock: Held while swapping word <-> cleared + */ + spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** @@ -310,6 +320,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } +/* + * This one is special, since it doesn't actually clear the bit, rather it + * sets the corresponding bit in the ->cleared mask instead. Paired with + * the caller doing sbitmap_batch_clear() if a given index is full, which + * will clear the previously freed entries in the corresponding ->word. + */ +static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) +{ + unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared; + + set_bit(SB_NR_TO_BIT(sb, bitnr), addr); +} + static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb, unsigned int bitnr) { @@ -321,8 +344,6 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } -unsigned int sbitmap_weight(const struct sbitmap *sb); - /** * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. * @sb: Bitmap to show. -- cgit v1.2.3 From 5d2ee7122c73be6a3b6bfe90d237e8aed737cfaa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Nov 2018 17:36:41 -0700 Subject: sbitmap: optimize wakeup check Even if we have no waiters on any of the sbitmap_queue wait states, we still have to loop every entry to check. We do this for every IO, so the cost adds up. Shift a bit of the cost to the slow path, when we actually have waiters. Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain an internal count of how many are currently active. Then we can simply check this count in sbq_wake_ptr() and not have to loop if we don't have any sleepers. Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 11 +++++------ drivers/target/iscsi/iscsi_target_util.c | 12 ++++++----- include/linux/sbitmap.h | 34 ++++++++++++++++++++++++++++++++ lib/sbitmap.c | 28 ++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 11 deletions(-) (limited to 'include/linux/sbitmap.h') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 87bc5df72d48..2089c6c62f44 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; - DEFINE_WAIT(wait); + DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; bool drop_ctx; int tag; @@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; - prepare_to_wait_exclusive(&ws->wait, &wait, - TASK_UNINTERRUPTIBLE); + sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != -1) @@ -167,6 +166,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) bt_prev = bt; io_schedule(); + sbitmap_finish_wait(bt, ws, &wait); + data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, data->ctx->cpu); @@ -176,8 +177,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) else bt = &tags->bitmap_tags; - finish_wait(&ws->wait, &wait); - /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so @@ -192,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (drop_ctx && data->ctx) blk_mq_put_ctx(data->ctx); - finish_wait(&ws->wait, &wait); + sbitmap_finish_wait(bt, ws, &wait); found_tag: return tag + tag_offset; diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 36b742932c72..86987da86dd6 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -150,24 +150,26 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd) static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup) { int tag = -1; - DEFINE_WAIT(wait); + DEFINE_SBQ_WAIT(wait); struct sbq_wait_state *ws; + struct sbitmap_queue *sbq; if (state == TASK_RUNNING) return tag; - ws = &se_sess->sess_tag_pool.ws[0]; + sbq = &se_sess->sess_tag_pool; + ws = &sbq->ws[0]; for (;;) { - prepare_to_wait_exclusive(&ws->wait, &wait, state); + sbitmap_prepare_to_wait(sbq, ws, &wait, state); if (signal_pending_state(state, current)) break; - tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup); + tag = sbitmap_queue_get(sbq, cpup); if (tag >= 0) break; schedule(); } - finish_wait(&ws->wait, &wait); + sbitmap_finish_wait(sbq, ws, &wait); return tag; } diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 81359d45751e..92806a2dbab7 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -135,6 +135,11 @@ struct sbitmap_queue { */ struct sbq_wait_state *ws; + /* + * @ws_active: count of currently active ws waitqueues + */ + atomic_t ws_active; + /** * @round_robin: Allocate bits in strict round-robin order. */ @@ -552,4 +557,33 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); */ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); +struct sbq_wait { + int accounted; + struct wait_queue_entry wait; +}; + +#define DEFINE_SBQ_WAIT(name) \ + struct sbq_wait name = { \ + .accounted = 0, \ + .wait = { \ + .private = current, \ + .func = autoremove_wake_function, \ + .entry = LIST_HEAD_INIT((name).wait.entry), \ + } \ + } + +/* + * Wrapper around prepare_to_wait_exclusive(), which maintains some extra + * internal state. + */ +void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, + struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait, int state); + +/* + * Must be paired with sbitmap_prepare_to_wait(). + */ +void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait); + #endif /* __LINUX_SCALE_BITMAP_H */ diff --git a/lib/sbitmap.c b/lib/sbitmap.c index f99382e59314..a89fbe7cf6ca 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -394,6 +394,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, sbq->min_shallow_depth = UINT_MAX; sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); + atomic_set(&sbq->ws_active, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { @@ -509,6 +510,9 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) { int i, wake_index; + if (!atomic_read(&sbq->ws_active)) + return NULL; + wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; @@ -634,6 +638,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); + seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active)); seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { @@ -649,3 +654,26 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_show); + +void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, + struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait, int state) +{ + if (!sbq_wait->accounted) { + atomic_inc(&sbq->ws_active); + sbq_wait->accounted = 1; + } + prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); +} +EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); + +void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait) +{ + finish_wait(&ws->wait, &sbq_wait->wait); + if (sbq_wait->accounted) { + atomic_dec(&sbq->ws_active); + sbq_wait->accounted = 0; + } +} +EXPORT_SYMBOL_GPL(sbitmap_finish_wait); -- cgit v1.2.3 From 8c2def893afc60d88160d524acf345765cf0c447 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 3 Dec 2018 14:45:43 -0800 Subject: sbitmap: fix sbitmap_for_each_set() We need to ignore bits in the cleared mask when iterating over all set bits. Fixes: ea86ea2cdced ("sbitmap: ammortize cost of clearing bits") Reported-by: Jens Axboe@kernel.dk> Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux/sbitmap.h') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 92806a2dbab7..03f50fcedc79 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -265,12 +265,14 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb, nr = SB_NR_TO_BIT(sb, start); while (scanned < sb->depth) { - struct sbitmap_word *word = &sb->map[index]; - unsigned int depth = min_t(unsigned int, word->depth - nr, + unsigned long word; + unsigned int depth = min_t(unsigned int, + sb->map[index].depth - nr, sb->depth - scanned); scanned += depth; - if (!word->word) + word = sb->map[index].word & ~sb->map[index].cleared; + if (!word) goto next; /* @@ -280,7 +282,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb, */ depth += nr; while (1) { - nr = find_next_bit(&word->word, depth, nr); + nr = find_next_bit(&word, depth, nr); if (nr >= depth) break; if (!fn(sb, (index << sb->shift) + nr, data)) -- cgit v1.2.3 From 9f6b7ef6c3ebe35be77b0ae3cf12e4d25ae80420 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Dec 2018 08:49:00 -0700 Subject: sbitmap: add helpers for add/del wait queue handling After commit 5d2ee7122c73, users of sbitmap that need wait queue handling must use the provided helpers. But we only added prepare_to_wait()/finish_wait() style helpers, add the equivalent add_wait_queue/list_del wrappers as we.. This is needed to ensure kyber plays by the sbitmap waitqueue rules. Tested-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 16 ++++++++++++++-- lib/sbitmap.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 6 deletions(-) (limited to 'include/linux/sbitmap.h') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 03f50fcedc79..14d558146aea 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -560,13 +560,13 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); struct sbq_wait { - int accounted; + struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */ struct wait_queue_entry wait; }; #define DEFINE_SBQ_WAIT(name) \ struct sbq_wait name = { \ - .accounted = 0, \ + .sbq = NULL, \ .wait = { \ .private = current, \ .func = autoremove_wake_function, \ @@ -588,4 +588,16 @@ void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); +/* + * Wrapper around add_wait_queue(), which maintains some extra internal state + */ +void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, + struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait); + +/* + * Must be paired with sbitmap_add_wait_queue() + */ +void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait); + #endif /* __LINUX_SCALE_BITMAP_H */ diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 5b3e56d68dab..65c2d06250a6 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -671,13 +671,35 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) } EXPORT_SYMBOL_GPL(sbitmap_queue_show); +void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, + struct sbq_wait_state *ws, + struct sbq_wait *sbq_wait) +{ + if (!sbq_wait->sbq) { + sbq_wait->sbq = sbq; + atomic_inc(&sbq->ws_active); + } + add_wait_queue(&ws->wait, &sbq_wait->wait); +} +EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); + +void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) +{ + list_del_init(&sbq_wait->wait.entry); + if (sbq_wait->sbq) { + atomic_dec(&sbq_wait->sbq->ws_active); + sbq_wait->sbq = NULL; + } +} +EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); + void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state) { - if (!sbq_wait->accounted) { + if (!sbq_wait->sbq) { atomic_inc(&sbq->ws_active); - sbq_wait->accounted = 1; + sbq_wait->sbq = sbq; } prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); } @@ -687,9 +709,9 @@ void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { finish_wait(&ws->wait, &sbq_wait->wait); - if (sbq_wait->accounted) { + if (sbq_wait->sbq) { atomic_dec(&sbq->ws_active); - sbq_wait->accounted = 0; + sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_finish_wait); -- cgit v1.2.3 From 1e4471e74c75acb3f89959ffa02a241227937ae2 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Sat, 16 Mar 2019 16:24:37 +0800 Subject: sbitmap: trivial - update comment for sbitmap_deferred_clear_bit "sbitmap_batch_clear" should be "sbitmap_deferred_clear" Acked-by: Omar Sandoval Signed-off-by: Shenghui Wang Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sbitmap.h') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 14d558146aea..20f3e3f029b9 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -330,7 +330,7 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with - * the caller doing sbitmap_batch_clear() if a given index is full, which + * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. */ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) -- cgit v1.2.3