summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-13 21:43:59 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-13 21:43:59 +0300
commitce8a79d5601aab94c02ed4539c48e8605422ac94 (patch)
tree7830a97a475d57284640c8e2d3516521722708b6 /block
parent96f7e448b9f4546ffd0356ffceb2b9586777f316 (diff)
parentf596da3efaf4130ff61cd029558845808df9bf99 (diff)
downloadlinux-ce8a79d5601aab94c02ed4539c48e8605422ac94.tar.xz
Merge tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - NVMe pull requests via Christoph: - Support some passthrough commands without CAP_SYS_ADMIN (Kanchan Joshi) - Refactor PCIe probing and reset (Christoph Hellwig) - Various fabrics authentication fixes and improvements (Sagi Grimberg) - Avoid fallback to sequential scan due to transient issues (Uday Shankar) - Implement support for the DEAC bit in Write Zeroes (Christoph Hellwig) - Allow overriding the IEEE OUI and firmware revision in configfs for nvmet (Aleksandr Miloserdov) - Force reconnect when number of queue changes in nvmet (Daniel Wagner) - Minor fixes and improvements (Uros Bizjak, Joel Granados, Sagi Grimberg, Christoph Hellwig, Christophe JAILLET) - Fix and cleanup nvme-fc req allocation (Chaitanya Kulkarni) - Use the common tagset helpers in nvme-pci driver (Christoph Hellwig) - Cleanup the nvme-pci removal path (Christoph Hellwig) - Use kstrtobool() instead of strtobool (Christophe JAILLET) - Allow unprivileged passthrough of Identify Controller (Joel Granados) - Support io stats on the mpath device (Sagi Grimberg) - Minor nvmet cleanup (Sagi Grimberg) - MD pull requests via Song: - Code cleanups (Christoph) - Various fixes - Floppy pull request from Denis: - Fix a memory leak in the init error path (Yuan) - Series fixing some batch wakeup issues with sbitmap (Gabriel) - Removal of the pktcdvd driver that was deprecated more than 5 years ago, and subsequent removal of the devnode callback in struct block_device_operations as no users are now left (Greg) - Fix for partition read on an exclusively opened bdev (Jan) - Series of elevator API cleanups (Jinlong, Christoph) - Series of fixes and cleanups for blk-iocost (Kemeng) - Series of fixes and cleanups for blk-throttle (Kemeng) - Series adding concurrent support for sync queues in BFQ (Yu) - Series bringing drbd a bit closer to the out-of-tree maintained version (Christian, Joel, Lars, Philipp) - Misc drbd fixes (Wang) - blk-wbt fixes and tweaks for enable/disable (Yu) - Fixes for mq-deadline for zoned devices (Damien) - Add support for read-only and offline zones for null_blk (Shin'ichiro) - Series fixing the delayed holder tracking, as used by DM (Yu, Christoph) - Series enabling bio alloc caching for IRQ based IO (Pavel) - Series enabling userspace peer-to-peer DMA (Logan) - BFQ waker fixes (Khazhismel) - Series fixing elevator refcount issues (Christoph, Jinlong) - Series cleaning up references around queue destruction (Christoph) - Series doing quiesce by tagset, enabling cleanups in drivers (Christoph, Chao) - Series untangling the queue kobject and queue references (Christoph) - Misc fixes and cleanups (Bart, David, Dawei, Jinlong, Kemeng, Ye, Yang, Waiman, Shin'ichiro, Randy, Pankaj, Christoph) * tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux: (247 commits) blktrace: Fix output non-blktrace event when blk_classic option enabled block: sed-opal: Don't include <linux/kernel.h> sed-opal: allow using IOC_OPAL_SAVE for locking too blk-cgroup: Fix typo in comment block: remove bio_set_op_attrs nvmet: don't open-code NVME_NS_ATTR_RO enumeration nvme-pci: use the tagset alloc/free helpers nvme: add the Apple shared tag workaround to nvme_alloc_io_tag_set nvme: only set reserved_tags in nvme_alloc_io_tag_set for fabrics controllers nvme: consolidate setting the tagset flags nvme: pass nr_maps explicitly to nvme_alloc_io_tag_set block: bio_copy_data_iter nvme-pci: split out a nvme_pci_ctrl_is_dead helper nvme-pci: return early on ctrl state mismatch in nvme_reset_work nvme-pci: rename nvme_disable_io_queues nvme-pci: cleanup nvme_suspend_queue nvme-pci: remove nvme_pci_disable nvme-pci: remove nvme_disable_admin_queue nvme: merge nvme_shutdown_ctrl into nvme_disable_ctrl nvme: use nvme_wait_ready in nvme_shutdown_ctrl ...
Diffstat (limited to 'block')
-rw-r--r--block/bdev.c4
-rw-r--r--block/bfq-cgroup.c12
-rw-r--r--block/bfq-iosched.c102
-rw-r--r--block/bfq-iosched.h32
-rw-r--r--block/bfq-wf2q.c157
-rw-r--r--block/bio.c146
-rw-r--r--block/blk-cgroup.c94
-rw-r--r--block/blk-cgroup.h10
-rw-r--r--block/blk-core.c83
-rw-r--r--block/blk-crypto-internal.h22
-rw-r--r--block/blk-crypto-profile.c1
-rw-r--r--block/blk-crypto-sysfs.c11
-rw-r--r--block/blk-crypto.c37
-rw-r--r--block/blk-ia-ranges.c3
-rw-r--r--block/blk-iocost.c57
-rw-r--r--block/blk-iolatency.c37
-rw-r--r--block/blk-map.c14
-rw-r--r--block/blk-merge.c44
-rw-r--r--block/blk-mq-sched.c8
-rw-r--r--block/blk-mq-sysfs.c11
-rw-r--r--block/blk-mq.c229
-rw-r--r--block/blk-mq.h14
-rw-r--r--block/blk-settings.c6
-rw-r--r--block/blk-sysfs.c137
-rw-r--r--block/blk-throttle.c102
-rw-r--r--block/blk-wbt.c26
-rw-r--r--block/blk-wbt.h17
-rw-r--r--block/blk.h27
-rw-r--r--block/bsg-lib.c2
-rw-r--r--block/bsg.c11
-rw-r--r--block/elevator.c254
-rw-r--r--block/elevator.h20
-rw-r--r--block/fops.c7
-rw-r--r--block/genhd.c35
-rw-r--r--block/holder.c103
-rw-r--r--block/ioctl.c12
-rw-r--r--block/mq-deadline.c83
-rw-r--r--block/sed-opal.c39
38 files changed, 1132 insertions, 877 deletions
diff --git a/block/bdev.c b/block/bdev.c
index d699ecdb3260..edc110d90df4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -224,7 +224,7 @@ int fsync_bdev(struct block_device *bdev)
EXPORT_SYMBOL(fsync_bdev);
/**
- * freeze_bdev -- lock a filesystem and force it into a consistent state
+ * freeze_bdev - lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
* If a superblock is found on this device, we take the s_umount semaphore
@@ -268,7 +268,7 @@ done:
EXPORT_SYMBOL(freeze_bdev);
/**
- * thaw_bdev -- unlock filesystem
+ * thaw_bdev - unlock filesystem
* @bdev: blockdevice to unlock
*
* Unlocks the filesystem and marks it writeable again after freeze_bdev().
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 7d624a3a3f0f..627476bc6495 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
{
blkg_rwstat_add(&bfqg->stats.queued, opf, 1);
bfqg_stats_end_empty_time(&bfqg->stats);
- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+ if (!(bfqq == bfqg->bfqd->in_service_queue))
bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}
@@ -552,6 +552,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
*/
bfqg->bfqd = bfqd;
bfqg->active_entities = 0;
+ bfqg->num_queues_with_pending_reqs = 0;
bfqg->online = true;
bfqg->rq_pos_tree = RB_ROOT;
}
@@ -645,6 +646,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_entity *entity = &bfqq->entity;
struct bfq_group *old_parent = bfqq_group(bfqq);
+ bool has_pending_reqs = false;
/*
* No point to move bfqq to the same group, which can happen when
@@ -665,6 +667,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
bfqq->ref++;
+ if (entity->in_groups_with_pending_reqs) {
+ has_pending_reqs = true;
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ }
+
/* If bfqq is empty, then bfq_bfqq_expire also invokes
* bfq_del_bfqq_busy, thereby removing bfqq and its entity
* from data structures related to current group. Otherwise we
@@ -692,6 +699,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/* pin down bfqg and its associated blkg */
bfqg_and_blkg_get(bfqg);
+ if (has_pending_reqs)
+ bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
+
if (bfq_bfqq_busy(bfqq)) {
if (unlikely(!bfqd->nonrot_with_queueing))
bfq_pos_tree_add_move(bfqd, bfqq);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 7ea427817f7f..a72304c728fc 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -820,7 +820,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* much easier to maintain the needed state:
* 1) all active queues have the same weight,
* 2) all active queues belong to the same I/O-priority class,
- * 3) there are no active groups.
+ * 3) there is at most one active group.
* In particular, the last condition is always true if hierarchical
* support or the cgroups interface are not enabled, thus no state
* needs to be maintained in this case.
@@ -852,7 +852,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
return varied_queue_weights || multiple_classes_busy
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- || bfqd->num_groups_with_pending_reqs > 0
+ || bfqd->num_groups_with_pending_reqs > 1
#endif
;
}
@@ -870,9 +870,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
* In most scenarios, the rate at which nodes are created/destroyed
* should be low too.
*/
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct rb_root_cached *root)
+void bfq_weights_tree_add(struct bfq_queue *bfqq)
{
+ struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree;
struct bfq_entity *entity = &bfqq->entity;
struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
bool leftmost = true;
@@ -944,13 +944,14 @@ inc_counter:
* See the comments to the function bfq_weights_tree_add() for considerations
* about overhead.
*/
-void __bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct rb_root_cached *root)
+void bfq_weights_tree_remove(struct bfq_queue *bfqq)
{
+ struct rb_root_cached *root;
+
if (!bfqq->weight_counter)
return;
+ root = &bfqq->bfqd->queue_weights_tree;
bfqq->weight_counter->num_active--;
if (bfqq->weight_counter->num_active > 0)
goto reset_entity_pointer;
@@ -964,59 +965,6 @@ reset_entity_pointer:
}
/*
- * Invoke __bfq_weights_tree_remove on bfqq and decrement the number
- * of active groups for each queue's inactive parent entity.
- */
-void bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- struct bfq_entity *entity = bfqq->entity.parent;
-
- for_each_entity(entity) {
- struct bfq_sched_data *sd = entity->my_sched_data;
-
- if (sd->next_in_service || sd->in_service_entity) {
- /*
- * entity is still active, because either
- * next_in_service or in_service_entity is not
- * NULL (see the comments on the definition of
- * next_in_service for details on why
- * in_service_entity must be checked too).
- *
- * As a consequence, its parent entities are
- * active as well, and thus this loop must
- * stop here.
- */
- break;
- }
-
- /*
- * The decrement of num_groups_with_pending_reqs is
- * not performed immediately upon the deactivation of
- * entity, but it is delayed to when it also happens
- * that the first leaf descendant bfqq of entity gets
- * all its pending requests completed. The following
- * instructions perform this delayed decrement, if
- * needed. See the comments on
- * num_groups_with_pending_reqs for details.
- */
- if (entity->in_groups_with_pending_reqs) {
- entity->in_groups_with_pending_reqs = false;
- bfqd->num_groups_with_pending_reqs--;
- }
- }
-
- /*
- * Next function is invoked last, because it causes bfqq to be
- * freed if the following holds: bfqq is not in service and
- * has no dispatched request. DO NOT use bfqq after the next
- * function invocation.
- */
- __bfq_weights_tree_remove(bfqd, bfqq,
- &bfqd->queue_weights_tree);
-}
-
-/*
* Return expired entry, or NULL to just start from scratch in rbtree.
*/
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
@@ -2135,7 +2083,9 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) ||
- now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC)
+ now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
+ bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq ||
+ bfqq == &bfqd->oom_bfqq)
return;
/*
@@ -2373,22 +2323,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq)
return 0;
}
-#if 0 /* Still not clear if we can do without next two functions */
-static void bfq_activate_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
-
- bfqd->rq_in_driver++;
-}
-
-static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
-
- bfqd->rq_in_driver--;
-}
-#endif
-
static void bfq_remove_request(struct request_queue *q,
struct request *rq)
{
@@ -6261,7 +6195,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
*/
bfqq->budget_timeout = jiffies;
- bfq_weights_tree_remove(bfqd, bfqq);
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ bfq_weights_tree_remove(bfqq);
}
now_ns = ktime_get_ns();
@@ -6784,6 +6719,12 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
true, is_sync,
NULL);
+ if (unlikely(bfqq == &bfqd->oom_bfqq))
+ bfqq_already_existing = true;
+ } else
+ bfqq_already_existing = true;
+
+ if (!bfqq_already_existing) {
bfqq->waker_bfqq = old_bfqq->waker_bfqq;
bfqq->tentative_waker_bfqq = NULL;
@@ -6797,8 +6738,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
if (bfqq->waker_bfqq)
hlist_add_head(&bfqq->woken_list_node,
&bfqq->waker_bfqq->woken_list);
- } else
- bfqq_already_existing = true;
+ }
}
}
@@ -7045,6 +6985,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
#endif
blk_stat_disable_accounting(bfqd->queue);
+ clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
wbt_enable_default(bfqd->queue);
kfree(bfqd);
@@ -7190,6 +7131,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
+ set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
wbt_disable_default(q);
blk_stat_enable_accounting(q);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 71f721670ab6..9fa89577322d 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -492,27 +492,27 @@ struct bfq_data {
struct rb_root_cached queue_weights_tree;
/*
- * Number of groups with at least one descendant process that
+ * Number of groups with at least one process that
* has at least one request waiting for completion. Note that
* this accounts for also requests already dispatched, but not
* yet completed. Therefore this number of groups may differ
* (be larger) than the number of active groups, as a group is
* considered active only if its corresponding entity has
- * descendant queues with at least one request queued. This
+ * queues with at least one request queued. This
* number is used to decide whether a scenario is symmetric.
* For a detailed explanation see comments on the computation
* of the variable asymmetric_scenario in the function
* bfq_better_to_idle().
*
* However, it is hard to compute this number exactly, for
- * groups with multiple descendant processes. Consider a group
- * that is inactive, i.e., that has no descendant process with
+ * groups with multiple processes. Consider a group
+ * that is inactive, i.e., that has no process with
* pending I/O inside BFQ queues. Then suppose that
* num_groups_with_pending_reqs is still accounting for this
- * group, because the group has descendant processes with some
+ * group, because the group has processes with some
* I/O request still in flight. num_groups_with_pending_reqs
* should be decremented when the in-flight request of the
- * last descendant process is finally completed (assuming that
+ * last process is finally completed (assuming that
* nothing else has changed for the group in the meantime, in
* terms of composition of the group and active/inactive state of child
* groups and processes). To accomplish this, an additional
@@ -521,7 +521,7 @@ struct bfq_data {
* we resort to the following tradeoff between simplicity and
* accuracy: for an inactive group that is still counted in
* num_groups_with_pending_reqs, we decrement
- * num_groups_with_pending_reqs when the first descendant
+ * num_groups_with_pending_reqs when the first
* process of the group remains with no request waiting for
* completion.
*
@@ -529,12 +529,12 @@ struct bfq_data {
* carefulness: to avoid multiple decrements, we flag a group,
* more precisely an entity representing a group, as still
* counted in num_groups_with_pending_reqs when it becomes
- * inactive. Then, when the first descendant queue of the
+ * inactive. Then, when the first queue of the
* entity remains with no request waiting for completion,
* num_groups_with_pending_reqs is decremented, and this flag
* is reset. After this flag is reset for the entity,
* num_groups_with_pending_reqs won't be decremented any
- * longer in case a new descendant queue of the entity remains
+ * longer in case a new queue of the entity remains
* with no request waiting for completion.
*/
unsigned int num_groups_with_pending_reqs;
@@ -931,7 +931,7 @@ struct bfq_group {
struct bfq_entity entity;
struct bfq_sched_data sched_data;
- void *bfqd;
+ struct bfq_data *bfqd;
struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
struct bfq_queue *async_idle_bfqq;
@@ -939,6 +939,7 @@ struct bfq_group {
struct bfq_entity *my_entity;
int active_entities;
+ int num_queues_with_pending_reqs;
struct rb_root rq_pos_tree;
@@ -968,13 +969,8 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct rb_root_cached *root);
-void __bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct rb_root_cached *root);
-void bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq);
+void bfq_weights_tree_add(struct bfq_queue *bfqq);
+void bfq_weights_tree_remove(struct bfq_queue *bfqq);
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
@@ -1078,6 +1074,8 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool expiration);
void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
+void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
+void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 8fc3da4c23bb..b02b53658ed4 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -218,6 +218,24 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
return false;
}
+static void bfq_inc_active_entities(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
+
+ if (bfqg != bfqg->bfqd->root_group)
+ bfqg->active_entities++;
+}
+
+static void bfq_dec_active_entities(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
+
+ if (bfqg != bfqg->bfqd->root_group)
+ bfqg->active_entities--;
+}
+
#else /* CONFIG_BFQ_GROUP_IOSCHED */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
@@ -230,6 +248,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
return true;
}
+static void bfq_inc_active_entities(struct bfq_entity *entity)
+{
+}
+
+static void bfq_dec_active_entities(struct bfq_entity *entity)
+{
+}
+
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
@@ -456,11 +482,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node = &entity->rb_node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd = NULL;
- struct bfq_group *bfqg = NULL;
- struct bfq_data *bfqd = NULL;
-#endif
bfq_insert(&st->active, entity);
@@ -471,17 +492,10 @@ static void bfq_active_insert(struct bfq_service_tree *st,
bfq_update_active_tree(node);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
if (bfqq)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (bfqg != bfqd->root_group)
- bfqg->active_entities++;
-#endif
+
+ bfq_inc_active_entities(entity);
}
/**
@@ -558,29 +572,16 @@ static void bfq_active_extract(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd = NULL;
- struct bfq_group *bfqg = NULL;
- struct bfq_data *bfqd = NULL;
-#endif
node = bfq_find_deepest(&entity->rb_node);
bfq_extract(&st->active, entity);
if (node)
bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
if (bfqq)
list_del(&bfqq->bfqq_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (bfqg != bfqd->root_group)
- bfqg->active_entities--;
-#endif
+
+ bfq_dec_active_entities(entity);
}
/**
@@ -706,22 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
if (entity->prio_changed) {
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
unsigned int prev_weight, new_weight;
- struct bfq_data *bfqd = NULL;
- struct rb_root_cached *root;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd;
- struct bfq_group *bfqg;
-#endif
-
- if (bfqq)
- bfqd = bfqq->bfqd;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- sd = entity->my_sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
- }
-#endif
/* Matches the smp_wmb() in bfq_group_set_weight. */
smp_rmb();
@@ -770,19 +755,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
* queue, remove the entity from its old weight counter (if
* there is a counter associated with the entity).
*/
- if (prev_weight != new_weight && bfqq) {
- root = &bfqd->queue_weights_tree;
- __bfq_weights_tree_remove(bfqd, bfqq, root);
- }
+ if (prev_weight != new_weight && bfqq)
+ bfq_weights_tree_remove(bfqq);
entity->weight = new_weight;
/*
* Add the entity, if it is not a weight-raised queue,
* to the counter associated with its new weight.
*/
- if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) {
- /* If we get here, root has been initialized. */
- bfq_weights_tree_add(bfqd, bfqq, root);
- }
+ if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1)
+ bfq_weights_tree_add(bfqq);
new_st->wsum += entity->weight;
@@ -984,19 +965,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
entity->on_st_or_in_serv = true;
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
- struct bfq_data *bfqd = bfqg->bfqd;
-
- if (!entity->in_groups_with_pending_reqs) {
- entity->in_groups_with_pending_reqs = true;
- bfqd->num_groups_with_pending_reqs++;
- }
- }
-#endif
-
bfq_update_fin_time_enqueue(entity, st, backshifted);
}
@@ -1082,12 +1050,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity)
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
- struct bfq_sched_data *sd,
bool non_blocking_wait_rq)
{
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
- if (sd->in_service_entity == entity || entity->tree == &st->active)
+ if (entity->sched_data->in_service_entity == entity ||
+ entity->tree == &st->active)
/*
* in service or already queued on the active tree,
* requeue or reposition
@@ -1119,14 +1087,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
bool non_blocking_wait_rq,
bool requeue, bool expiration)
{
- struct bfq_sched_data *sd;
-
for_each_entity(entity) {
- sd = entity->sched_data;
- __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
-
- if (!bfq_update_next_in_service(sd, entity, expiration) &&
- !requeue)
+ __bfq_activate_requeue_entity(entity, non_blocking_wait_rq);
+ if (!bfq_update_next_in_service(entity->sched_data, entity,
+ expiration) && !requeue)
break;
}
}
@@ -1646,6 +1610,32 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq == bfqd->in_service_queue, expiration);
}
+void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (!entity->in_groups_with_pending_reqs) {
+ entity->in_groups_with_pending_reqs = true;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++))
+ bfqq->bfqd->num_groups_with_pending_reqs++;
+#endif
+ }
+}
+
+void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (entity->in_groups_with_pending_reqs) {
+ entity->in_groups_with_pending_reqs = false;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs))
+ bfqq->bfqd->num_groups_with_pending_reqs--;
+#endif
+ }
+}
+
/*
* Called when the bfqq no longer has requests pending, remove it from
* the service tree. As a special case, it can be invoked during an
@@ -1668,8 +1658,14 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration)
bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
- if (!bfqq->dispatched)
- bfq_weights_tree_remove(bfqd, bfqq);
+ if (!bfqq->dispatched) {
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ /*
+ * Next function is invoked last, because it causes bfqq to be
+ * freed. DO NOT use bfqq after the next function invocation.
+ */
+ bfq_weights_tree_remove(bfqq);
+ }
}
/*
@@ -1686,10 +1682,11 @@ void bfq_add_bfqq_busy(struct bfq_queue *bfqq)
bfq_mark_bfqq_busy(bfqq);
bfqd->busy_queues[bfqq->ioprio_class - 1]++;
- if (!bfqq->dispatched)
+ if (!bfqq->dispatched) {
+ bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
if (bfqq->wr_coeff == 1)
- bfq_weights_tree_add(bfqd, bfqq,
- &bfqd->queue_weights_tree);
+ bfq_weights_tree_add(bfqq);
+ }
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues++;
diff --git a/block/bio.c b/block/bio.c
index 57c2f327225b..5f96fcae3f75 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -25,9 +25,15 @@
#include "blk-rq-qos.h"
#include "blk-cgroup.h"
+#define ALLOC_CACHE_THRESHOLD 16
+#define ALLOC_CACHE_SLACK 64
+#define ALLOC_CACHE_MAX 256
+
struct bio_alloc_cache {
struct bio *free_list;
+ struct bio *free_list_irq;
unsigned int nr;
+ unsigned int nr_irq;
};
static struct biovec_slab {
@@ -408,6 +414,22 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
queue_work(bs->rescue_workqueue, &bs->rescue_work);
}
+static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
+{
+ unsigned long flags;
+
+ /* cache->free_list must be empty */
+ if (WARN_ON_ONCE(cache->free_list))
+ return;
+
+ local_irq_save(flags);
+ cache->free_list = cache->free_list_irq;
+ cache->free_list_irq = NULL;
+ cache->nr += cache->nr_irq;
+ cache->nr_irq = 0;
+ local_irq_restore(flags);
+}
+
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
struct bio_set *bs)
@@ -417,8 +439,12 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
cache = per_cpu_ptr(bs->cache, get_cpu());
if (!cache->free_list) {
- put_cpu();
- return NULL;
+ if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
+ bio_alloc_irq_cache_splice(cache);
+ if (!cache->free_list) {
+ put_cpu();
+ return NULL;
+ }
}
bio = cache->free_list;
cache->free_list = bio->bi_next;
@@ -462,9 +488,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
* for per bio allocations.
*
- * If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process
- * context, not hard/soft IRQ.
- *
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
@@ -526,6 +549,8 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
}
if (unlikely(!p))
return NULL;
+ if (!mempool_is_saturated(&bs->bio_pool))
+ opf &= ~REQ_ALLOC_CACHE;
bio = p + bs->front_pad;
if (nr_vecs > BIO_INLINE_VECS) {
@@ -676,11 +701,8 @@ void guard_bio_eod(struct bio *bio)
bio_truncate(bio, maxsector << 9);
}
-#define ALLOC_CACHE_MAX 512
-#define ALLOC_CACHE_SLACK 64
-
-static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
- unsigned int nr)
+static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
{
unsigned int i = 0;
struct bio *bio;
@@ -692,6 +714,17 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
if (++i == nr)
break;
}
+ return i;
+}
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ nr -= __bio_alloc_cache_prune(cache, nr);
+ if (!READ_ONCE(cache->free_list)) {
+ bio_alloc_irq_cache_splice(cache);
+ __bio_alloc_cache_prune(cache, nr);
+ }
}
static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
@@ -725,6 +758,35 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
bs->cache = NULL;
}
+static inline void bio_put_percpu_cache(struct bio *bio)
+{
+ struct bio_alloc_cache *cache;
+
+ cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+ if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) {
+ put_cpu();
+ bio_free(bio);
+ return;
+ }
+
+ bio_uninit(bio);
+
+ if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
+ bio->bi_next = cache->free_list;
+ cache->free_list = bio;
+ cache->nr++;
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ bio->bi_next = cache->free_list_irq;
+ cache->free_list_irq = bio;
+ cache->nr_irq++;
+ local_irq_restore(flags);
+ }
+ put_cpu();
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@@ -740,20 +802,10 @@ void bio_put(struct bio *bio)
if (!atomic_dec_and_test(&bio->__bi_cnt))
return;
}
-
- if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) {
- struct bio_alloc_cache *cache;
-
- bio_uninit(bio);
- cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
- bio->bi_next = cache->free_list;
- cache->free_list = bio;
- if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
- bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
- put_cpu();
- } else {
+ if (bio->bi_opf & REQ_ALLOC_CACHE)
+ bio_put_percpu_cache(bio);
+ else
bio_free(bio);
- }
}
EXPORT_SYMBOL(bio_put);
@@ -863,6 +915,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
return false;
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return false;
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
if (*same_page)
@@ -1195,6 +1249,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
+ unsigned int gup_flags = 0;
ssize_t size, left;
unsigned len, i = 0;
size_t offset, trim;
@@ -1208,6 +1263,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
+ if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
+ gup_flags |= FOLL_PCI_P2PDMA;
+
/*
* Each segment in the iov is required to be a block size multiple.
* However, we may not be able to get the entire segment if it spans
@@ -1215,8 +1273,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* result to ensure the bio's total size is correct. The remainder of
* the iov data will be picked up in the next bio iteration.
*/
- size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
- nr_pages, &offset);
+ size = iov_iter_get_pages(iter, pages,
+ UINT_MAX - bio->bi_iter.bi_size,
+ nr_pages, &offset, gup_flags);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
@@ -1342,27 +1401,6 @@ void __bio_advance(struct bio *bio, unsigned bytes)
}
EXPORT_SYMBOL(__bio_advance);
-void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
- struct bio *src, struct bvec_iter *src_iter)
-{
- while (src_iter->bi_size && dst_iter->bi_size) {
- struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
- struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
- unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
- void *src_buf = bvec_kmap_local(&src_bv);
- void *dst_buf = bvec_kmap_local(&dst_bv);
-
- memcpy(dst_buf, src_buf, bytes);
-
- kunmap_local(dst_buf);
- kunmap_local(src_buf);
-
- bio_advance_iter_single(src, src_iter, bytes);
- bio_advance_iter_single(dst, dst_iter, bytes);
- }
-}
-EXPORT_SYMBOL(bio_copy_data_iter);
-
/**
* bio_copy_data - copy contents of data buffers from one bio to another
* @src: source bio
@@ -1376,7 +1414,21 @@ void bio_copy_data(struct bio *dst, struct bio *src)
struct bvec_iter src_iter = src->bi_iter;
struct bvec_iter dst_iter = dst->bi_iter;
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+ while (src_iter.bi_size && dst_iter.bi_size) {
+ struct bio_vec src_bv = bio_iter_iovec(src, src_iter);
+ struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter);
+ unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ void *src_buf = bvec_kmap_local(&src_bv);
+ void *dst_buf = bvec_kmap_local(&dst_bv);
+
+ memcpy(dst_buf, src_buf, bytes);
+
+ kunmap_local(dst_buf);
+ kunmap_local(src_buf);
+
+ bio_advance_iter_single(src, &src_iter, bytes);
+ bio_advance_iter_single(dst, &dst_iter, bytes);
+ }
}
EXPORT_SYMBOL(bio_copy_data);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ed761c62ad0a..50ac0dce95b8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
#define BLKG_DESTROY_BATCH_SIZE 64
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+ int cpu;
+
+ blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+ if (!blkcg->lhead)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu)
+ init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+ return 0;
+}
+
/**
* blkcg_css - find the current css
*
@@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
blkg->blkcg = blkcg;
u64_stats_init(&blkg->iostat.sync);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+ per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+ }
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -577,7 +610,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
* @pd: policy private data of interest
* @v: value to print
*
- * Print @v to @sf for the device assocaited with @pd.
+ * Print @v to @sf for the device associated with @pd.
*/
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
@@ -765,7 +798,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
* blkg_conf_finish - finish up per-blkg config update
- * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
+ * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
*
* Finish up after per-blkg config update. This function must be paired
* with blkg_conf_prep().
@@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg_gq *blkg;
+ struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+ struct llist_node *lnode;
+ struct blkg_iostat_set *bisc, *next_bisc;
/* Root-level stats are sourced from system-wide IO stats */
if (!cgroup_parent(css->cgroup))
@@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
rcu_read_lock();
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ lnode = llist_del_all(lhead);
+ if (!lnode)
+ goto out;
+
+ /*
+ * Iterate only the iostat_cpu's queued in the lockless list.
+ */
+ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+ struct blkcg_gq *blkg = bisc->blkg;
struct blkcg_gq *parent = blkg->parent;
- struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
struct blkg_iostat cur;
unsigned int seq;
+ WRITE_ONCE(bisc->lqueued, false);
+
/* fetch the current per-cpu values */
do {
seq = u64_stats_fetch_begin(&bisc->sync);
@@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
+ percpu_ref_put(&blkg->refcnt);
}
+out:
rcu_read_unlock();
}
@@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
mutex_unlock(&blkcg_pol_mutex);
+ free_percpu(blkcg->lhead);
kfree(blkcg);
}
@@ -1139,7 +1186,6 @@ static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct blkcg *blkcg;
- struct cgroup_subsys_state *ret;
int i;
mutex_lock(&blkcg_pol_mutex);
@@ -1148,12 +1194,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
blkcg = &blkcg_root;
} else {
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
- if (!blkcg) {
- ret = ERR_PTR(-ENOMEM);
+ if (!blkcg)
goto unlock;
- }
}
+ if (init_blkcg_llists(blkcg))
+ goto free_blkcg;
+
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkcg_policy_data *cpd;
@@ -1168,10 +1215,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
continue;
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
- if (!cpd) {
- ret = ERR_PTR(-ENOMEM);
+ if (!cpd)
goto free_pd_blkcg;
- }
+
blkcg->cpd[i] = cpd;
cpd->blkcg = blkcg;
cpd->plid = i;
@@ -1195,12 +1241,13 @@ free_pd_blkcg:
for (i--; i >= 0; i--)
if (blkcg->cpd[i])
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+ free_percpu(blkcg->lhead);
+free_blkcg:
if (blkcg != &blkcg_root)
kfree(blkcg);
unlock:
mutex_unlock(&blkcg_pol_mutex);
- return ret;
+ return ERR_PTR(-ENOMEM);
}
static int blkcg_css_online(struct cgroup_subsys_state *css)
@@ -1784,7 +1831,7 @@ out:
/**
* blkcg_schedule_throttle - this task needs to check for throttling
- * @gendisk: disk to throttle
+ * @disk: disk to throttle
* @use_memdelay: do we charge this to memory delay for PSI
*
* This is called by the IO controller when we know there's delay accumulated
@@ -1943,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio)
void blk_cgroup_bio_start(struct bio *bio)
{
+ struct blkcg *blkcg = bio->bi_blkg->blkcg;
int rwd = blk_cgroup_io_type(bio), cpu;
struct blkg_iostat_set *bis;
unsigned long flags;
@@ -1961,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio)
}
bis->cur.ios[rwd]++;
+ /*
+ * If the iostat_cpu isn't in a lockless list, put it into the
+ * list to indicate that a stat update is pending.
+ */
+ if (!READ_ONCE(bis->lqueued)) {
+ struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+ llist_add(&bis->lnode, lhead);
+ WRITE_ONCE(bis->lqueued, true);
+ percpu_ref_get(&bis->blkg->refcnt);
+ }
+
u64_stats_update_end_irqrestore(&bis->sync, flags);
if (cgroup_subsys_on_dfl(io_cgrp_subsys))
- cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+ cgroup_rstat_updated(blkcg->css.cgroup, cpu);
put_cpu();
}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index aa2b286bc825..1e94e404eaa8 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -18,6 +18,7 @@
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
+#include <linux/llist.h>
struct blkcg_gq;
struct blkg_policy_data;
@@ -43,6 +44,9 @@ struct blkg_iostat {
struct blkg_iostat_set {
struct u64_stats_sync sync;
+ struct blkcg_gq *blkg;
+ struct llist_node lnode;
+ int lqueued; /* queued in llist */
struct blkg_iostat cur;
struct blkg_iostat last;
};
@@ -97,6 +101,12 @@ struct blkcg {
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
+
+ /*
+ * List of updated percpu blkg_iostat_set's since the last flush.
+ */
+ struct llist_head __percpu *lhead;
+
#ifdef CONFIG_BLK_CGROUP_FC_APPID
char fc_app_id[FC_APPID_LEN];
#endif
diff --git a/block/blk-core.c b/block/blk-core.c
index 5487912befe8..3866b6c4cd88 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -59,13 +59,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
-DEFINE_IDA(blk_queue_ida);
+static DEFINE_IDA(blk_queue_ida);
/*
* For queue allocation
*/
-struct kmem_cache *blk_requestq_cachep;
-struct kmem_cache *blk_requestq_srcu_cachep;
+static struct kmem_cache *blk_requestq_cachep;
/*
* Controlling structure to kblockd
@@ -253,19 +252,44 @@ void blk_clear_pm_only(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
+static void blk_free_queue_rcu(struct rcu_head *rcu_head)
+{
+ kmem_cache_free(blk_requestq_cachep,
+ container_of(rcu_head, struct request_queue, rcu_head));
+}
+
+static void blk_free_queue(struct request_queue *q)
+{
+ percpu_ref_exit(&q->q_usage_counter);
+
+ if (q->poll_stat)
+ blk_stat_remove_callback(q, q->poll_cb);
+ blk_stat_free_callback(q->poll_cb);
+
+ blk_free_queue_stats(q->stats);
+ kfree(q->poll_stat);
+
+ if (queue_is_mq(q))
+ blk_mq_release(q);
+
+ ida_free(&blk_queue_ida, q->id);
+ call_rcu(&q->rcu_head, blk_free_queue_rcu);
+}
+
/**
* blk_put_queue - decrement the request_queue refcount
* @q: the request_queue structure to decrement the refcount for
*
- * Decrements the refcount of the request_queue kobject. When this reaches 0
- * we'll have blk_release_queue() called.
+ * Decrements the refcount of the request_queue and free it when the refcount
+ * reaches 0.
*
- * Context: Any context, but the last reference must not be dropped from
- * atomic context.
+ * Context: Can sleep.
*/
void blk_put_queue(struct request_queue *q)
{
- kobject_put(&q->kobj);
+ might_sleep();
+ if (refcount_dec_and_test(&q->refs))
+ blk_free_queue(q);
}
EXPORT_SYMBOL(blk_put_queue);
@@ -373,26 +397,20 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
+struct request_queue *blk_alloc_queue(int node_id)
{
struct request_queue *q;
- q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
- GFP_KERNEL | __GFP_ZERO, node_id);
+ q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
+ node_id);
if (!q)
return NULL;
- if (alloc_srcu) {
- blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
- if (init_srcu_struct(q->srcu) != 0)
- goto fail_q;
- }
-
q->last_merge = NULL;
q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
- goto fail_srcu;
+ goto fail_q;
q->stats = blk_alloc_queue_stats();
if (!q->stats)
@@ -406,8 +424,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
- kobject_init(&q->kobj, &blk_queue_ktype);
-
+ refcount_set(&q->refs, 1);
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
@@ -434,11 +451,8 @@ fail_stats:
blk_free_queue_stats(q->stats);
fail_id:
ida_free(&blk_queue_ida, q->id);
-fail_srcu:
- if (alloc_srcu)
- cleanup_srcu_struct(q->srcu);
fail_q:
- kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
+ kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
@@ -454,7 +468,7 @@ bool blk_get_queue(struct request_queue *q)
{
if (unlikely(blk_queue_dying(q)))
return false;
- kobject_get(&q->kobj);
+ refcount_inc(&q->refs);
return true;
}
EXPORT_SYMBOL(blk_get_queue);
@@ -945,18 +959,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev,
EXPORT_SYMBOL(bdev_start_io_acct);
/**
- * bio_start_io_acct_time - start I/O accounting for bio based drivers
- * @bio: bio to start account for
- * @start_time: start time that should be passed back to bio_end_io_acct().
- */
-void bio_start_io_acct_time(struct bio *bio, unsigned long start_time)
-{
- bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
- bio_op(bio), start_time);
-}
-EXPORT_SYMBOL_GPL(bio_start_io_acct_time);
-
-/**
* bio_start_io_acct - start I/O accounting for bio based drivers
* @bio: bio to start account for
*
@@ -1183,9 +1185,6 @@ int __init blk_dev_init(void)
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct bio, bi_opf));
- BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
- __alignof__(struct request_queue)) !=
- sizeof(struct request_queue));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1196,10 +1195,6 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
- blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
- sizeof(struct request_queue) +
- sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
-
blk_debugfs_root = debugfs_create_dir("block", NULL);
return 0;
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index e6818ffaddbf..a8cdaf26851e 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -21,9 +21,9 @@ extern const struct blk_crypto_mode blk_crypto_modes[];
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-int blk_crypto_sysfs_register(struct request_queue *q);
+int blk_crypto_sysfs_register(struct gendisk *disk);
-void blk_crypto_sysfs_unregister(struct request_queue *q);
+void blk_crypto_sysfs_unregister(struct gendisk *disk);
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@@ -65,14 +65,28 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
return rq->crypt_ctx;
}
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ struct blk_crypto_keyslot **slot_ptr);
+
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
+
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key);
+
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+ const struct blk_crypto_config *cfg);
+
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
-static inline int blk_crypto_sysfs_register(struct request_queue *q)
+static inline int blk_crypto_sysfs_register(struct gendisk *disk)
{
return 0;
}
-static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
+static inline void blk_crypto_sysfs_unregister(struct gendisk *disk)
+{
+}
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 96c511967386..0307fb0d95d3 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -32,6 +32,7 @@
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
+#include "blk-crypto-internal.h"
struct blk_crypto_keyslot {
atomic_t slot_refs;
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
index fd93bd2f33b7..55268edc0625 100644
--- a/block/blk-crypto-sysfs.c
+++ b/block/blk-crypto-sysfs.c
@@ -126,8 +126,9 @@ static struct kobj_type blk_crypto_ktype = {
* If the request_queue has a blk_crypto_profile, create the "crypto"
* subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
*/
-int blk_crypto_sysfs_register(struct request_queue *q)
+int blk_crypto_sysfs_register(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct blk_crypto_kobj *obj;
int err;
@@ -139,8 +140,8 @@ int blk_crypto_sysfs_register(struct request_queue *q)
return -ENOMEM;
obj->profile = q->crypto_profile;
- err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
- "crypto");
+ err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype,
+ &disk->queue_kobj, "crypto");
if (err) {
kobject_put(&obj->kobj);
return err;
@@ -149,9 +150,9 @@ int blk_crypto_sysfs_register(struct request_queue *q)
return 0;
}
-void blk_crypto_sysfs_unregister(struct request_queue *q)
+void blk_crypto_sysfs_unregister(struct gendisk *disk)
{
- kobject_put(q->crypto_kobject);
+ kobject_put(disk->queue->crypto_kobject);
}
static int __init blk_crypto_sysfs_init(void)
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index e44709fc6a08..45378586151f 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -273,7 +273,6 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
- struct blk_crypto_profile *profile;
/* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) {
@@ -290,10 +289,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
- profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
- if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
+ if (blk_crypto_config_supported_natively(bio->bi_bdev,
+ &bc_key->crypto_cfg))
return true;
-
if (blk_crypto_fallback_bio_prep(bio_ptr))
return true;
fail:
@@ -358,22 +356,29 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
return 0;
}
+bool blk_crypto_config_supported_natively(struct block_device *bdev,
+ const struct blk_crypto_config *cfg)
+{
+ return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile,
+ cfg);
+}
+
/*
* Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the
- * request queue it's submitted to supports inline crypto, or the
+ * block_device it's submitted to supports inline crypto, or the
* blk-crypto-fallback is enabled and supports the cfg).
*/
-bool blk_crypto_config_supported(struct request_queue *q,
+bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- __blk_crypto_cfg_supported(q->crypto_profile, cfg);
+ blk_crypto_config_supported_natively(bdev, cfg);
}
/**
* blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
+ * @bdev: block device to operate on
* @key: A key to use on the device
- * @q: the request queue for the device
*
* Upper layers must call this function to ensure that either the hardware
* supports the key's crypto settings, or the crypto API fallback has transforms
@@ -385,10 +390,10 @@ bool blk_crypto_config_supported(struct request_queue *q,
* blk-crypto-fallback is either disabled or the needed algorithm
* is disabled in the crypto API; or another -errno code.
*/
-int blk_crypto_start_using_key(const struct blk_crypto_key *key,
- struct request_queue *q)
+int blk_crypto_start_using_key(struct block_device *bdev,
+ const struct blk_crypto_key *key)
{
- if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
+ if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@@ -396,7 +401,7 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
/**
* blk_crypto_evict_key() - Evict a key from any inline encryption hardware
* it may have been programmed into
- * @q: The request queue who's associated inline encryption hardware this key
+ * @bdev: The block_device who's associated inline encryption hardware this key
* might have been programmed into
* @key: The key to evict
*
@@ -406,14 +411,16 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
*
* Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/
-int blk_crypto_evict_key(struct request_queue *q,
+int blk_crypto_evict_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
- if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return __blk_crypto_evict_key(q->crypto_profile, key);
/*
- * If the request_queue didn't support the key, then blk-crypto-fallback
+ * If the block_device didn't support the key, then blk-crypto-fallback
* may have been used, so try to evict the key from blk-crypto-fallback.
*/
return blk_crypto_fallback_evict_key(key);
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index 2bd1d311033b..2141931ddd37 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -123,7 +123,8 @@ int disk_register_independent_access_ranges(struct gendisk *disk)
*/
WARN_ON(iars->sysfs_registered);
ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
- &q->kobj, "%s", "independent_access_ranges");
+ &disk->queue_kobj, "%s",
+ "independent_access_ranges");
if (ret) {
disk->ia_ranges = NULL;
kobject_put(&iars->kobj);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 495396425bad..d1bdc12deaa7 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -111,7 +111,7 @@
* busy signal.
*
* As devices can have deep queues and be unfair in how the queued commands
- * are executed, soley depending on rq wait may not result in satisfactory
+ * are executed, solely depending on rq wait may not result in satisfactory
* control quality. For a better control quality, completion latency QoS
* parameters can be configured so that the device is considered saturated
* if N'th percentile completion latency rises above the set point.
@@ -556,7 +556,6 @@ struct ioc_now {
u64 now_ns;
u64 now;
u64 vnow;
- u64 vrate;
};
struct iocg_wait {
@@ -906,8 +905,10 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
if (idx == ioc->autop_idx && !force)
return false;
- if (idx != ioc->autop_idx)
+ if (idx != ioc->autop_idx) {
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
+ ioc->vtime_base_rate = VTIME_PER_USEC;
+ }
ioc->autop_idx = idx;
ioc->autop_too_fast_at = 0;
@@ -975,7 +976,7 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
if (ioc->busy_level != prev_busy_level || nr_lagging)
- trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+ trace_iocost_ioc_vrate_adj(ioc, vrate,
missed_ppm, rq_wait_pct,
nr_lagging, nr_shortages);
@@ -1018,10 +1019,11 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
unsigned seq;
+ u64 vrate;
now->now_ns = ktime_get();
now->now = ktime_to_us(now->now_ns);
- now->vrate = atomic64_read(&ioc->vtime_rate);
+ vrate = atomic64_read(&ioc->vtime_rate);
/*
* The current vtime is
@@ -1034,7 +1036,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
do {
seq = read_seqcount_begin(&ioc->period_seqcount);
now->vnow = ioc->period_at_vtime +
- (now->now - ioc->period_at) * now->vrate;
+ (now->now - ioc->period_at) * vrate;
} while (read_seqcount_retry(&ioc->period_seqcount, seq));
}
@@ -2203,8 +2205,8 @@ static void ioc_timer_fn(struct timer_list *timer)
LIST_HEAD(surpluses);
int nr_debtors, nr_shortages = 0, nr_lagging = 0;
u64 usage_us_sum = 0;
- u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
- u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+ u32 ppm_rthr;
+ u32 ppm_wthr;
u32 missed_ppm[2], rq_wait_pct;
u64 period_vtime;
int prev_busy_level;
@@ -2215,6 +2217,8 @@ static void ioc_timer_fn(struct timer_list *timer)
/* take care of active iocgs */
spin_lock_irq(&ioc->lock);
+ ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+ ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
ioc_now(ioc, &now);
period_vtime = now.vnow - ioc->period_at_vtime;
@@ -2878,7 +2882,7 @@ static int blk_iocost_init(struct gendisk *disk)
spin_unlock_irq(&ioc->lock);
/*
- * rqos must be added before activation to allow iocg_pd_init() to
+ * rqos must be added before activation to allow ioc_pd_init() to
* lookup the ioc from q. This means that the rqos methods may get
* called before policy activation completion, can't assume that the
* target bio has an iocg associated and need to test for NULL iocg.
@@ -3187,11 +3191,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(disk->queue);
}
+ blk_mq_freeze_queue(disk->queue);
+ blk_mq_quiesce_queue(disk->queue);
+
spin_lock_irq(&ioc->lock);
memcpy(qos, ioc->params.qos, sizeof(qos));
enable = ioc->enabled;
user = ioc->user_qos_params;
- spin_unlock_irq(&ioc->lock);
while ((p = strsep(&input, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
@@ -3258,15 +3264,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (qos[QOS_MIN] > qos[QOS_MAX])
goto einval;
- spin_lock_irq(&ioc->lock);
-
if (enable) {
blk_stat_enable_accounting(disk->queue);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = true;
+ wbt_disable_default(disk->queue);
} else {
blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = false;
+ wbt_enable_default(disk->queue);
}
if (user) {
@@ -3279,9 +3285,17 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
+ blk_mq_unquiesce_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+
blkdev_put_no_open(bdev);
return nbytes;
einval:
+ spin_unlock_irq(&ioc->lock);
+
+ blk_mq_unquiesce_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+
ret = -EINVAL;
err:
blkdev_put_no_open(bdev);
@@ -3336,6 +3350,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
struct block_device *bdev;
+ struct request_queue *q;
struct ioc *ioc;
u64 u[NR_I_LCOEFS];
bool user;
@@ -3346,18 +3361,21 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- ioc = q_to_ioc(bdev_get_queue(bdev));
+ q = bdev_get_queue(bdev);
+ ioc = q_to_ioc(q);
if (!ioc) {
ret = blk_iocost_init(bdev->bd_disk);
if (ret)
goto err;
- ioc = q_to_ioc(bdev_get_queue(bdev));
+ ioc = q_to_ioc(q);
}
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
spin_lock_irq(&ioc->lock);
memcpy(u, ioc->params.i_lcoefs, sizeof(u));
user = ioc->user_cost_model;
- spin_unlock_irq(&ioc->lock);
while ((p = strsep(&input, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
@@ -3394,7 +3412,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
user = true;
}
- spin_lock_irq(&ioc->lock);
if (user) {
memcpy(ioc->params.i_lcoefs, u, sizeof(u));
ioc->user_cost_model = true;
@@ -3404,10 +3421,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
blkdev_put_no_open(bdev);
return nbytes;
einval:
+ spin_unlock_irq(&ioc->lock);
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
ret = -EINVAL;
err:
blkdev_put_no_open(bdev);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 571fa95aafe9..778a0057193e 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -141,7 +141,7 @@ struct iolatency_grp {
struct latency_stat __percpu *stats;
struct latency_stat cur_stat;
struct blk_iolatency *blkiolat;
- struct rq_depth rq_depth;
+ unsigned int max_depth;
struct rq_wait rq_wait;
atomic64_t window_start;
atomic_t scale_cookie;
@@ -280,7 +280,7 @@ static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
struct iolatency_grp *iolat = private_data;
- return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
+ return rq_wait_inc_below(rqw, iolat->max_depth);
}
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
@@ -364,15 +364,17 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
}
/*
- * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the
+ * Change the queue depth of the iolatency_grp. We add 1/16th of the
* queue depth at a time so we don't get wild swings and hopefully dial in to
- * fairer distribution of the overall queue depth.
+ * fairer distribution of the overall queue depth. We halve the queue depth
+ * at a time so we can scale down queue depth quickly from default unlimited
+ * to target.
*/
static void scale_change(struct iolatency_grp *iolat, bool up)
{
unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
unsigned long scale = scale_amount(qd, up);
- unsigned long old = iolat->rq_depth.max_depth;
+ unsigned long old = iolat->max_depth;
if (old > qd)
old = qd;
@@ -384,12 +386,12 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
if (old < qd) {
old += scale;
old = min(old, qd);
- iolat->rq_depth.max_depth = old;
+ iolat->max_depth = old;
wake_up_all(&iolat->rq_wait.wait);
}
} else {
old >>= 1;
- iolat->rq_depth.max_depth = max(old, 1UL);
+ iolat->max_depth = max(old, 1UL);
}
}
@@ -403,9 +405,6 @@ static void check_scale_change(struct iolatency_grp *iolat)
u64 scale_lat;
int direction = 0;
- if (lat_to_blkg(iolat)->parent == NULL)
- return;
-
parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
if (!parent)
return;
@@ -445,7 +444,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
}
/* We're as low as we can go. */
- if (iolat->rq_depth.max_depth == 1 && direction < 0) {
+ if (iolat->max_depth == 1 && direction < 0) {
blkcg_use_delay(lat_to_blkg(iolat));
return;
}
@@ -453,7 +452,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
/* We're back to the default cookie, unthrottle all the things. */
if (cur_cookie == DEFAULT_SCALE_COOKIE) {
blkcg_clear_delay(lat_to_blkg(iolat));
- iolat->rq_depth.max_depth = UINT_MAX;
+ iolat->max_depth = UINT_MAX;
wake_up_all(&iolat->rq_wait.wait);
return;
}
@@ -508,7 +507,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
* We don't want to count issue_as_root bio's in the cgroups latency
* statistics as it could skew the numbers downwards.
*/
- if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
+ if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) {
u64 sub = iolat->min_lat_nsec;
if (req_time < sub)
blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
@@ -920,7 +919,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
}
preempt_enable();
- if (iolat->rq_depth.max_depth == UINT_MAX)
+ if (iolat->max_depth == UINT_MAX)
seq_printf(s, " missed=%llu total=%llu depth=max",
(unsigned long long)stat.ps.missed,
(unsigned long long)stat.ps.total);
@@ -928,7 +927,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
seq_printf(s, " missed=%llu total=%llu depth=%u",
(unsigned long long)stat.ps.missed,
(unsigned long long)stat.ps.total,
- iolat->rq_depth.max_depth);
+ iolat->max_depth);
}
static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
@@ -945,12 +944,12 @@ static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
- if (iolat->rq_depth.max_depth == UINT_MAX)
+ if (iolat->max_depth == UINT_MAX)
seq_printf(s, " depth=max avg_lat=%llu win=%llu",
avg_lat, cur_win);
else
seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
- iolat->rq_depth.max_depth, avg_lat, cur_win);
+ iolat->max_depth, avg_lat, cur_win);
}
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
@@ -994,9 +993,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
latency_stat_init(iolat, &iolat->cur_stat);
rq_wait_init(&iolat->rq_wait);
spin_lock_init(&iolat->child_lat.lock);
- iolat->rq_depth.queue_depth = blkg->q->nr_requests;
- iolat->rq_depth.max_depth = UINT_MAX;
- iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
+ iolat->max_depth = UINT_MAX;
iolat->blkiolat = blkiolat;
iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
atomic64_set(&iolat->window_start, now);
diff --git a/block/blk-map.c b/block/blk-map.c
index 34735626b00f..19940c978c73 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -267,6 +267,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
{
unsigned int max_sectors = queue_max_hw_sectors(rq->q);
unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
+ unsigned int gup_flags = 0;
struct bio *bio;
int ret;
int j;
@@ -278,6 +279,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (bio == NULL)
return -ENOMEM;
+ if (blk_queue_pci_p2pdma(rq->q))
+ gup_flags |= FOLL_PCI_P2PDMA;
+
while (iov_iter_count(iter)) {
struct page **pages, *stack_pages[UIO_FASTIOV];
ssize_t bytes;
@@ -286,11 +290,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (nr_vecs <= ARRAY_SIZE(stack_pages)) {
pages = stack_pages;
- bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
- nr_vecs, &offs);
+ bytes = iov_iter_get_pages(iter, pages, LONG_MAX,
+ nr_vecs, &offs, gup_flags);
} else {
- bytes = iov_iter_get_pages_alloc2(iter, &pages,
- LONG_MAX, &offs);
+ bytes = iov_iter_get_pages_alloc(iter, &pages,
+ LONG_MAX, &offs, gup_flags);
}
if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT;
@@ -555,7 +559,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
size_t nr_iter = iov_iter_count(iter);
size_t nr_segs = iter->nr_segs;
struct bio_vec *bvecs, *bvprvp = NULL;
- struct queue_limits *lim = &q->limits;
+ const struct queue_limits *lim = &q->limits;
unsigned int nsegs = 0, bytes = 0;
struct bio *bio;
size_t i;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ff04e9290715..35a8f75cc45d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -100,13 +100,14 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
* is defined as 'unsigned int', meantime it has to be aligned to with the
* logical block size, which is the minimum accepted unit by hardware.
*/
-static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
+static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
{
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
-static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
- unsigned *nsegs, struct bio_set *bs)
+static struct bio *bio_split_discard(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned *nsegs, struct bio_set *bs)
{
unsigned int max_discard_sectors, granularity;
sector_t tmp;
@@ -146,7 +147,8 @@ static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
}
static struct bio *bio_split_write_zeroes(struct bio *bio,
- struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
+ const struct queue_limits *lim,
+ unsigned *nsegs, struct bio_set *bs)
{
*nsegs = 0;
if (!lim->max_write_zeroes_sectors)
@@ -165,7 +167,7 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
* aligned to a physical block boundary.
*/
static inline unsigned get_max_io_size(struct bio *bio,
- struct queue_limits *lim)
+ const struct queue_limits *lim)
{
unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
@@ -184,7 +186,15 @@ static inline unsigned get_max_io_size(struct bio *bio,
return max_sectors & ~(lbs - 1);
}
-static inline unsigned get_max_segment_size(struct queue_limits *lim,
+/**
+ * get_max_segment_size() - maximum number of bytes to add as a single segment
+ * @lim: Request queue limits.
+ * @start_page: See below.
+ * @offset: Offset from @start_page where to add a segment.
+ *
+ * Returns the maximum number of bytes that can be added as a single segment.
+ */
+static inline unsigned get_max_segment_size(const struct queue_limits *lim,
struct page *start_page, unsigned long offset)
{
unsigned long mask = lim->seg_boundary_mask;
@@ -192,11 +202,10 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim,
offset = mask & (page_to_phys(start_page) + offset);
/*
- * overflow may be triggered in case of zero page physical address
- * on 32bit arch, use queue's max segment size when that happens.
+ * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
+ * after having calculated the minimum.
*/
- return min_not_zero(mask - offset + 1,
- (unsigned long)lim->max_segment_size);
+ return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1;
}
/**
@@ -219,9 +228,9 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim,
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
-static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
- unsigned *nsegs, unsigned *bytes, unsigned max_segs,
- unsigned max_bytes)
+static bool bvec_split_segs(const struct queue_limits *lim,
+ const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
+ unsigned max_segs, unsigned max_bytes)
{
unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
@@ -267,7 +276,7 @@ static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
-static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
+static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
@@ -331,8 +340,9 @@ split:
* The split bio is allocated from @q->bio_split, which is provided by the
* block layer.
*/
-struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
- unsigned int *nr_segs)
+struct bio *__bio_split_to_limits(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nr_segs)
{
struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
struct bio *split;
@@ -377,7 +387,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
*/
struct bio *bio_split_to_limits(struct bio *bio)
{
- struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
+ const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
if (bio_may_exceed_limits(bio, lim))
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index a4f7c101b53b..23d1a90fec42 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -555,6 +555,7 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
return 0;
}
+/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
unsigned int flags = q->tag_set->flags;
@@ -563,13 +564,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
unsigned long i;
int ret;
- if (!e) {
- blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
- q->elevator = NULL;
- q->nr_requests = q->tag_set->queue_depth;
- return 0;
- }
-
/*
* Default to double of smaller one between hw queue_depth and 128,
* since we don't split into sync/async like the old code did.
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 93997d297d42..4515288fbe35 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -185,7 +185,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct blk_mq_ctx *ctx;
- int i, ret;
+ int i, j, ret;
if (!hctx->nr_ctx)
return 0;
@@ -197,9 +197,16 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
hctx_for_each_ctx(hctx, ctx, i) {
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
- break;
+ goto out;
}
+ return 0;
+out:
+ hctx_for_each_ctx(hctx, ctx, j) {
+ if (j < i)
+ kobject_del(&ctx->kobj);
+ }
+ kobject_del(&hctx->kobj);
return ret;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 228a6696d835..c5cf0dbca1db 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
* blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
- * @q: request queue.
+ * @set: tag_set to wait on
*
* Note: it is driver's responsibility for making sure that quiesce has
- * been started.
+ * been started on or more of the request_queues of the tag_set. This
+ * function only waits for the quiesce on those request_queues that had
+ * the quiesce flag set using blk_mq_quiesce_queue_nowait.
*/
-void blk_mq_wait_quiesce_done(struct request_queue *q)
+void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
- if (blk_queue_has_srcu(q))
- synchronize_srcu(q->srcu);
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ synchronize_srcu(set->srcu);
else
synchronize_rcu();
}
@@ -280,7 +282,9 @@ EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
void blk_mq_quiesce_queue(struct request_queue *q)
{
blk_mq_quiesce_queue_nowait(q);
- blk_mq_wait_quiesce_done(q);
+ /* nothing to wait for non-mq queues */
+ if (queue_is_mq(q))
+ blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
@@ -311,6 +315,33 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (!blk_queue_skip_tagset_quiesce(q))
+ blk_mq_quiesce_queue_nowait(q);
+ }
+ blk_mq_wait_quiesce_done(set);
+ mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (!blk_queue_skip_tagset_quiesce(q))
+ blk_mq_unquiesce_queue(q);
+ }
+ mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
@@ -544,25 +575,26 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
+
if (rq_list_empty(plug->cached_rq)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
- if (rq)
- goto got_it;
- return NULL;
- }
- rq = rq_list_peek(&plug->cached_rq);
- if (!rq || rq->q != q)
- return NULL;
+ if (!rq)
+ return NULL;
+ } else {
+ rq = rq_list_peek(&plug->cached_rq);
+ if (!rq || rq->q != q)
+ return NULL;
- if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
- return NULL;
- if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
- return NULL;
+ if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
+ return NULL;
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+ return NULL;
+
+ plug->cached_rq = rq_list_next(rq);
+ }
- plug->cached_rq = rq_list_next(rq);
-got_it:
rq->cmd_flags = opf;
INIT_LIST_HEAD(&rq->queuelist);
return rq;
@@ -1529,7 +1561,13 @@ static void blk_mq_rq_timed_out(struct request *req)
blk_add_timer(req);
}
-static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
+struct blk_expired_data {
+ bool has_timedout_rq;
+ unsigned long next;
+ unsigned long timeout_start;
+};
+
+static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
{
unsigned long deadline;
@@ -1539,13 +1577,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
return false;
deadline = READ_ONCE(rq->deadline);
- if (time_after_eq(jiffies, deadline))
+ if (time_after_eq(expired->timeout_start, deadline))
return true;
- if (*next == 0)
- *next = deadline;
- else if (time_after(*next, deadline))
- *next = deadline;
+ if (expired->next == 0)
+ expired->next = deadline;
+ else if (time_after(expired->next, deadline))
+ expired->next = deadline;
return false;
}
@@ -1561,7 +1599,7 @@ void blk_mq_put_rq_ref(struct request *rq)
static bool blk_mq_check_expired(struct request *rq, void *priv)
{
- unsigned long *next = priv;
+ struct blk_expired_data *expired = priv;
/*
* blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
@@ -1570,7 +1608,18 @@ static bool blk_mq_check_expired(struct request *rq, void *priv)
* it was completed and reallocated as a new request after returning
* from blk_mq_check_expired().
*/
- if (blk_mq_req_expired(rq, next))
+ if (blk_mq_req_expired(rq, expired)) {
+ expired->has_timedout_rq = true;
+ return false;
+ }
+ return true;
+}
+
+static bool blk_mq_handle_expired(struct request *rq, void *priv)
+{
+ struct blk_expired_data *expired = priv;
+
+ if (blk_mq_req_expired(rq, expired))
blk_mq_rq_timed_out(rq);
return true;
}
@@ -1579,7 +1628,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, timeout_work);
- unsigned long next = 0;
+ struct blk_expired_data expired = {
+ .timeout_start = jiffies,
+ };
struct blk_mq_hw_ctx *hctx;
unsigned long i;
@@ -1599,10 +1650,23 @@ static void blk_mq_timeout_work(struct work_struct *work)
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
+ /* check if there is any timed-out request */
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
+ if (expired.has_timedout_rq) {
+ /*
+ * Before walking tags, we must ensure any submit started
+ * before the current time has finished. Since the submit
+ * uses srcu or rcu, wait for a synchronization point to
+ * ensure all running submits have finished
+ */
+ blk_mq_wait_quiesce_done(q->tag_set);
+
+ expired.next = 0;
+ blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
+ }
- if (next != 0) {
- mod_timer(&q->timeout, next);
+ if (expired.next != 0) {
+ mod_timer(&q->timeout, expired.next);
} else {
/*
* Request timeouts are handled as a forward rolling timer. If
@@ -3248,21 +3312,22 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
- if (!tags->rqs) {
- blk_mq_free_tags(tags);
- return NULL;
- }
+ if (!tags->rqs)
+ goto err_free_tags;
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
- if (!tags->static_rqs) {
- kfree(tags->rqs);
- blk_mq_free_tags(tags);
- return NULL;
- }
+ if (!tags->static_rqs)
+ goto err_free_rqs;
return tags;
+
+err_free_rqs:
+ kfree(tags->rqs);
+err_free_tags:
+ blk_mq_free_tags(tags);
+ return NULL;
}
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
@@ -3975,7 +4040,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
- q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
+ q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
@@ -4011,14 +4076,11 @@ void blk_mq_destroy_queue(struct request_queue *q)
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
blk_queue_start_drain(q);
- blk_freeze_queue(q);
+ blk_mq_freeze_queue_wait(q);
blk_sync_queue(q);
blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
-
- /* @q is and will stay empty, shutdown and put */
- blk_put_queue(q);
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
@@ -4035,6 +4097,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
blk_mq_destroy_queue(q);
+ blk_put_queue(q);
return ERR_PTR(-ENOMEM);
}
set_bit(GD_OWNS_QUEUE, &disk->state);
@@ -4147,9 +4210,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q)
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
- WARN_ON_ONCE(blk_queue_has_srcu(q) !=
- !!(set->flags & BLK_MQ_F_BLOCKING));
-
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -4325,12 +4385,12 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
}
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
- int cur_nr_hw_queues, int new_nr_hw_queues)
+ int new_nr_hw_queues)
{
struct blk_mq_tags **new_tags;
- if (cur_nr_hw_queues >= new_nr_hw_queues)
- return 0;
+ if (set->nr_hw_queues >= new_nr_hw_queues)
+ goto done;
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
@@ -4338,21 +4398,15 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
return -ENOMEM;
if (set->tags)
- memcpy(new_tags, set->tags, cur_nr_hw_queues *
+ memcpy(new_tags, set->tags, set->nr_hw_queues *
sizeof(*set->tags));
kfree(set->tags);
set->tags = new_tags;
+done:
set->nr_hw_queues = new_nr_hw_queues;
-
return 0;
}
-static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
- int new_nr_hw_queues)
-{
- return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
-}
-
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@@ -4406,10 +4460,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
- if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
- return -ENOMEM;
+ if (set->flags & BLK_MQ_F_BLOCKING) {
+ set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
+ if (!set->srcu)
+ return -ENOMEM;
+ ret = init_srcu_struct(set->srcu);
+ if (ret)
+ goto out_free_srcu;
+ }
ret = -ENOMEM;
+ set->tags = kcalloc_node(set->nr_hw_queues,
+ sizeof(struct blk_mq_tags *), GFP_KERNEL,
+ set->numa_node);
+ if (!set->tags)
+ goto out_cleanup_srcu;
+
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
sizeof(set->map[i].mq_map[0]),
@@ -4437,6 +4503,12 @@ out_free_mq_map:
}
kfree(set->tags);
set->tags = NULL;
+out_cleanup_srcu:
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ cleanup_srcu_struct(set->srcu);
+out_free_srcu:
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ kfree(set->srcu);
return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
@@ -4476,6 +4548,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
+ if (set->flags & BLK_MQ_F_BLOCKING) {
+ cleanup_srcu_struct(set->srcu);
+ kfree(set->srcu);
+ }
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
@@ -4564,17 +4640,10 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
+ /* keep a reference to the elevator module as we'll switch back */
+ __elevator_get(qe->type);
list_add(&qe->node, head);
-
- /*
- * After elevator_switch, the previous elevator_queue will be
- * released by elevator_release. The reference of the io scheduler
- * module get by elevator_get will also be put. So we need to get
- * a reference of the io scheduler module here to prevent it to be
- * removed.
- */
- __module_get(qe->type->elevator_owner);
- elevator_switch(q, NULL);
+ elevator_disable(q);
mutex_unlock(&q->sysfs_lock);
return true;
@@ -4607,6 +4676,8 @@ static void blk_mq_elv_switch_back(struct list_head *head,
mutex_lock(&q->sysfs_lock);
elevator_switch(q, t);
+ /* drop the reference acquired in blk_mq_elv_switch_none */
+ elevator_put(t);
mutex_unlock(&q->sysfs_lock);
}
@@ -4643,11 +4714,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
}
prev_nr_hw_queues = set->nr_hw_queues;
- if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
- 0)
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
goto reregister;
- set->nr_hw_queues = nr_hw_queues;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -4867,15 +4936,13 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
void blk_mq_cancel_work_sync(struct request_queue *q)
{
- if (queue_is_mq(q)) {
- struct blk_mq_hw_ctx *hctx;
- unsigned long i;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
- cancel_delayed_work_sync(&q->requeue_work);
+ cancel_delayed_work_sync(&q->requeue_work);
- queue_for_each_hw_ctx(q, hctx, i)
- cancel_delayed_work_sync(&hctx->run_work);
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ cancel_delayed_work_sync(&hctx->run_work);
}
static int __init blk_mq_init(void)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 0b2870839cdd..ef59fee62780 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
- if (!blk_queue_has_srcu(q)) { \
- rcu_read_lock(); \
- (dispatch_ops); \
- rcu_read_unlock(); \
- } else { \
+ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \
int srcu_idx; \
\
might_sleep_if(check_sleep); \
- srcu_idx = srcu_read_lock((q)->srcu); \
+ srcu_idx = srcu_read_lock((q)->tag_set->srcu); \
(dispatch_ops); \
- srcu_read_unlock((q)->srcu, srcu_idx); \
+ srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \
+ } else { \
+ rcu_read_lock(); \
+ (dispatch_ops); \
+ rcu_read_unlock(); \
} \
} while (0)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8ac1038d0c79..0477c4d527fe 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -481,7 +481,7 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
}
EXPORT_SYMBOL(blk_queue_io_opt);
-static int queue_limit_alignment_offset(struct queue_limits *lim,
+static int queue_limit_alignment_offset(const struct queue_limits *lim,
sector_t sector)
{
unsigned int granularity = max(lim->physical_block_size, lim->io_min);
@@ -491,8 +491,8 @@ static int queue_limit_alignment_offset(struct queue_limits *lim,
return (granularity + lim->alignment_offset - alignment) % granularity;
}
-static unsigned int queue_limit_discard_alignment(struct queue_limits *lim,
- sector_t sector)
+static unsigned int queue_limit_discard_alignment(
+ const struct queue_limits *lim, sector_t sector)
{
unsigned int alignment, granularity, offset;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e71b3b43927c..93d9e9c9a6ea 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -470,6 +470,9 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
if (!wbt_rq_qos(q))
return -EINVAL;
+ if (wbt_disabled(q))
+ return sprintf(page, "0\n");
+
return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
}
@@ -680,8 +683,8 @@ static struct attribute *queue_attrs[] = {
static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
int n)
{
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
if (attr == &queue_io_timeout_entry.attr &&
(!q->mq_ops || !q->mq_ops->timeout))
@@ -707,8 +710,8 @@ static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->show)
@@ -724,68 +727,19 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct queue_sysfs_entry *entry = to_queue(attr);
- struct request_queue *q;
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->store)
return -EIO;
- q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
-static void blk_free_queue_rcu(struct rcu_head *rcu_head)
-{
- struct request_queue *q = container_of(rcu_head, struct request_queue,
- rcu_head);
-
- kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
-}
-
-/**
- * blk_release_queue - releases all allocated resources of the request_queue
- * @kobj: pointer to a kobject, whose container is a request_queue
- *
- * This function releases all allocated resources of the request queue.
- *
- * The struct request_queue refcount is incremented with blk_get_queue() and
- * decremented with blk_put_queue(). Once the refcount reaches 0 this function
- * is called.
- *
- * Drivers exist which depend on the release of the request_queue to be
- * synchronous, it should not be deferred.
- *
- * Context: can sleep
- */
-static void blk_release_queue(struct kobject *kobj)
-{
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
-
- might_sleep();
-
- percpu_ref_exit(&q->q_usage_counter);
-
- if (q->poll_stat)
- blk_stat_remove_callback(q, q->poll_cb);
- blk_stat_free_callback(q->poll_cb);
-
- blk_free_queue_stats(q->stats);
- kfree(q->poll_stat);
-
- if (queue_is_mq(q))
- blk_mq_release(q);
-
- if (blk_queue_has_srcu(q))
- cleanup_srcu_struct(q->srcu);
-
- ida_free(&blk_queue_ida, q->id);
- call_rcu(&q->rcu_head, blk_free_queue_rcu);
-}
-
static const struct sysfs_ops queue_sysfs_ops = {
.show = queue_attr_show,
.store = queue_attr_store,
@@ -796,12 +750,30 @@ static const struct attribute_group *blk_queue_attr_groups[] = {
NULL
};
-struct kobj_type blk_queue_ktype = {
+static void blk_queue_release(struct kobject *kobj)
+{
+ /* nothing to do here, all data is associated with the parent gendisk */
+}
+
+static struct kobj_type blk_queue_ktype = {
.default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
- .release = blk_release_queue,
+ .release = blk_queue_release,
};
+static void blk_debugfs_remove(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+
+ mutex_lock(&q->debugfs_mutex);
+ blk_trace_shutdown(q);
+ debugfs_remove_recursive(q->debugfs_dir);
+ q->debugfs_dir = NULL;
+ q->sched_debugfs_dir = NULL;
+ q->rqos_debugfs_dir = NULL;
+ mutex_unlock(&q->debugfs_mutex);
+}
+
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
@@ -812,47 +784,47 @@ int blk_register_queue(struct gendisk *disk)
int ret;
mutex_lock(&q->sysfs_dir_lock);
-
- ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue");
+ kobject_init(&disk->queue_kobj, &blk_queue_ktype);
+ ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
- goto unlock;
+ goto out_put_queue_kobj;
- if (queue_is_mq(q))
- blk_mq_sysfs_register(disk);
+ if (queue_is_mq(q)) {
+ ret = blk_mq_sysfs_register(disk);
+ if (ret)
+ goto out_put_queue_kobj;
+ }
mutex_lock(&q->sysfs_lock);
mutex_lock(&q->debugfs_mutex);
- q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
- blk_debugfs_root);
+ q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
if (queue_is_mq(q))
blk_mq_debugfs_register(q);
mutex_unlock(&q->debugfs_mutex);
ret = disk_register_independent_access_ranges(disk);
if (ret)
- goto put_dev;
+ goto out_debugfs_remove;
if (q->elevator) {
ret = elv_register_queue(q, false);
if (ret)
- goto put_dev;
+ goto out_unregister_ia_ranges;
}
- ret = blk_crypto_sysfs_register(q);
+ ret = blk_crypto_sysfs_register(disk);
if (ret)
- goto put_dev;
+ goto out_elv_unregister;
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
- kobject_uevent(&q->kobj, KOBJ_ADD);
+ kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
if (q->elevator)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock);
-
-unlock:
mutex_unlock(&q->sysfs_dir_lock);
/*
@@ -871,13 +843,16 @@ unlock:
return ret;
-put_dev:
+out_elv_unregister:
elv_unregister_queue(q);
+out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
+out_debugfs_remove:
+ blk_debugfs_remove(disk);
mutex_unlock(&q->sysfs_lock);
+out_put_queue_kobj:
+ kobject_put(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
- kobject_del(&q->kobj);
-
return ret;
}
@@ -915,7 +890,7 @@ void blk_unregister_queue(struct gendisk *disk)
*/
if (queue_is_mq(q))
blk_mq_sysfs_unregister(disk);
- blk_crypto_sysfs_unregister(q);
+ blk_crypto_sysfs_unregister(disk);
mutex_lock(&q->sysfs_lock);
elv_unregister_queue(q);
@@ -923,15 +898,9 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_unlock(&q->sysfs_lock);
/* Now that we've deleted all child objects, we can delete the queue. */
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
- kobject_del(&q->kobj);
+ kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
+ kobject_del(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
- mutex_lock(&q->debugfs_mutex);
- blk_trace_shutdown(q);
- debugfs_remove_recursive(q->debugfs_dir);
- q->debugfs_dir = NULL;
- q->sched_debugfs_dir = NULL;
- q->rqos_debugfs_dir = NULL;
- mutex_unlock(&q->debugfs_mutex);
+ blk_debugfs_remove(disk);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 847721dc2b2b..6fb5a2f9e1ee 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -129,7 +129,7 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
/*
* cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
* make the IO dispatch more smooth.
- * Scale up: linearly scale up according to lapsed time since upgrade. For
+ * Scale up: linearly scale up according to elapsed time since upgrade. For
* every throtl_slice, the limit scales up 1/2 .low limit till the
* limit hits .max limit
* Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
@@ -395,8 +395,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
* If on the default hierarchy, we switch to properly hierarchical
* behavior where limits on a given throtl_grp are applied to the
* whole subtree rather than just the group itself. e.g. If 16M
- * read_bps limit is set on the root group, the whole system can't
- * exceed 16M for the device.
+ * read_bps limit is set on a parent group, summary bps of
+ * parent group and its subtree groups can't exceed 16M for the
+ * device.
*
* If not on the default hierarchy, the broken flat hierarchy
* behavior is retained where all throtl_grps are treated as if
@@ -644,7 +645,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
* that bandwidth. Do try to make use of that bandwidth while giving
* credit.
*/
- if (time_after_eq(start, tg->slice_start[rw]))
+ if (time_after(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
@@ -821,17 +822,15 @@ static void tg_update_carryover(struct throtl_grp *tg)
tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
}
-static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
- u32 iops_limit, unsigned long *wait)
+static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
+ u32 iops_limit)
{
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
if (iops_limit == UINT_MAX) {
- if (wait)
- *wait = 0;
- return true;
+ return 0;
}
jiffy_elapsed = jiffies - tg->slice_start[rw];
@@ -841,21 +840,16 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
tg->carryover_ios[rw];
if (tg->io_disp[rw] + 1 <= io_allowed) {
- if (wait)
- *wait = 0;
- return true;
+ return 0;
}
/* Calc approx time to dispatch */
jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
-
- if (wait)
- *wait = jiffy_wait;
- return false;
+ return jiffy_wait;
}
-static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
- u64 bps_limit, unsigned long *wait)
+static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
+ u64 bps_limit)
{
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes;
@@ -864,9 +858,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* no need to throttle if this bio's bytes have been accounted */
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
- if (wait)
- *wait = 0;
- return true;
+ return 0;
}
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
@@ -879,9 +871,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
tg->carryover_bytes[rw];
if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
- if (wait)
- *wait = 0;
- return true;
+ return 0;
}
/* Calc approx time to dispatch */
@@ -896,9 +886,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
* up we did. Add that time also.
*/
jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
- if (wait)
- *wait = jiffy_wait;
- return false;
+ return jiffy_wait;
}
/*
@@ -946,8 +934,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
- if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
- tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
+ bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+ iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
+ if (bps_wait + iops_wait == 0) {
if (wait)
*wait = 0;
return true;
@@ -1066,7 +1055,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio);
- bio_set_flag(bio, BIO_BPS_THROTTLED);
/*
* If our parent is another tg, we just need to transfer @bio to
@@ -1079,6 +1067,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
start_parent_slice_with_credit(tg, parent_tg, rw);
} else {
+ bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
&parent_sq->queued[rw]);
BUG_ON(tg->td->nr_queued[rw] <= 0);
@@ -1737,7 +1726,18 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
* Set the flag to make sure throtl_pending_timer_fn() won't
* stop until all throttled bios are dispatched.
*/
- blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
+ tg->flags |= THROTL_TG_CANCELING;
+
+ /*
+ * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
+ * will be inserted to service queue without THROTL_TG_PENDING
+ * set in tg_update_disptime below. Then IO dispatched from
+ * child in tg_dispatch_one_bio will trigger double insertion
+ * and corrupt the tree.
+ */
+ if (!(tg->flags & THROTL_TG_PENDING))
+ continue;
+
/*
* Update disptime after setting the above flag to make sure
* throtl_select_dispatch() won't exit without dispatching.
@@ -1762,7 +1762,6 @@ static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
return min(rtime, wtime);
}
-/* tg should not be an intermediate node */
static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
{
struct throtl_service_queue *parent_sq;
@@ -1816,24 +1815,29 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
return ret;
}
-static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
{
struct throtl_service_queue *sq = &tg->service_queue;
- bool read_limit, write_limit;
+ bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];
/*
- * if cgroup reaches low limit (if low limit is 0, the cgroup always
- * reaches), it's ok to upgrade to next limit
+ * if low limit is zero, low limit is always reached.
+ * if low limit is non-zero, we can check if there is any request
+ * is queued to determine if low limit is reached as we throttle
+ * request according to limit.
*/
- read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
- write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
- if (!read_limit && !write_limit)
- return true;
- if (read_limit && sq->nr_queued[READ] &&
- (!write_limit || sq->nr_queued[WRITE]))
- return true;
- if (write_limit && sq->nr_queued[WRITE] &&
- (!read_limit || sq->nr_queued[READ]))
+ return !limit || sq->nr_queued[rw];
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+ /*
+ * cgroup reaches low limit when low limit of READ and WRITE are
+ * both reached, it's ok to upgrade to next limit if cgroup reaches
+ * low limit
+ */
+ if (throtl_low_limit_reached(tg, READ) &&
+ throtl_low_limit_reached(tg, WRITE))
return true;
if (time_after_eq(jiffies,
@@ -1951,8 +1955,7 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
* If cgroup is below low limit, consider downgrade and throttle other
* cgroups
*/
- if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
- time_after_eq(now, tg_last_low_overflow_time(tg) +
+ if (time_after_eq(now, tg_last_low_overflow_time(tg) +
td->throtl_slice) &&
(!throtl_tg_is_idle(tg) ||
!list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
@@ -1962,6 +1965,11 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
{
+ struct throtl_data *td = tg->td;
+
+ if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
+ return false;
+
while (true) {
if (!throtl_tg_can_downgrade(tg))
return false;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index c293e08b301f..68a774d7a7c9 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -27,6 +27,7 @@
#include "blk-wbt.h"
#include "blk-rq-qos.h"
+#include "elevator.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@@ -422,6 +423,14 @@ static void wbt_update_limits(struct rq_wb *rwb)
rwb_wake_all(rwb);
}
+bool wbt_disabled(struct request_queue *q)
+{
+ struct rq_qos *rqos = wbt_rq_qos(q);
+
+ return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT ||
+ RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL;
+}
+
u64 wbt_get_min_lat(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@@ -435,8 +444,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val)
struct rq_qos *rqos = wbt_rq_qos(q);
if (!rqos)
return;
+
RQWB(rqos)->min_lat_nsec = val;
- RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ if (val)
+ RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ else
+ RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;
+
wbt_update_limits(RQWB(rqos));
}
@@ -638,11 +652,15 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
*/
void wbt_enable_default(struct request_queue *q)
{
- struct rq_qos *rqos = wbt_rq_qos(q);
+ struct rq_qos *rqos;
+ bool disable_flag = q->elevator &&
+ test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags);
/* Throttling already enabled? */
+ rqos = wbt_rq_qos(q);
if (rqos) {
- if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
+ if (!disable_flag &&
+ RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
return;
}
@@ -651,7 +669,7 @@ void wbt_enable_default(struct request_queue *q)
if (!blk_queue_registered(q))
return;
- if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
+ if (queue_is_mq(q) && !disable_flag)
wbt_init(q);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 7e44eccc676d..e3ea6e7e2900 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -28,13 +28,15 @@ enum {
};
/*
- * Enable states. Either off, or on by default (done at init time),
- * or on through manual setup in sysfs.
+ * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other
+ * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered
+ * to WBT_STATE_OFF/ON_MANUAL.
*/
enum {
- WBT_STATE_ON_DEFAULT = 1,
- WBT_STATE_ON_MANUAL = 2,
- WBT_STATE_OFF_DEFAULT
+ WBT_STATE_ON_DEFAULT = 1, /* on by default */
+ WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */
+ WBT_STATE_OFF_DEFAULT = 3, /* off by default */
+ WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */
};
struct rq_wb {
@@ -94,6 +96,7 @@ void wbt_enable_default(struct request_queue *);
u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
+bool wbt_disabled(struct request_queue *);
void wbt_set_write_cache(struct request_queue *, bool);
@@ -125,6 +128,10 @@ static inline u64 wbt_default_latency_nsec(struct request_queue *q)
{
return 0;
}
+static inline bool wbt_disabled(struct request_queue *q)
+{
+ return true;
+}
#endif /* CONFIG_BLK_WBT */
diff --git a/block/blk.h b/block/blk.h
index a186ea20f39d..4c3b3325219a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -26,11 +26,6 @@ struct blk_flush_queue {
spinlock_t mq_flush_lock;
};
-extern struct kmem_cache *blk_requestq_cachep;
-extern struct kmem_cache *blk_requestq_srcu_cachep;
-extern struct kobj_type blk_queue_ktype;
-extern struct ida blk_queue_ida;
-
bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
@@ -104,7 +99,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
return true;
}
-static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
+static inline bool __bvec_gap_to_prev(const struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
return (offset & lim->virt_boundary_mask) ||
@@ -115,7 +110,7 @@ static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
*/
-static inline bool bvec_gap_to_prev(struct queue_limits *lim,
+static inline bool bvec_gap_to_prev(const struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
if (!lim->virt_boundary_mask)
@@ -278,6 +273,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
void blk_insert_flush(struct request *rq);
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
+void elevator_disable(struct request_queue *q);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
@@ -297,7 +293,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
static inline bool bio_may_exceed_limits(struct bio *bio,
- struct queue_limits *lim)
+ const struct queue_limits *lim)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
@@ -320,8 +316,9 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
-struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
- unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -428,15 +425,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
-static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
-{
- if (srcu)
- return blk_requestq_srcu_cachep;
- return blk_requestq_cachep;
-}
-struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
+struct request_queue *blk_alloc_queue(int node_id);
-int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
+int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner);
int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index d6f5dcdce748..435c32373cd6 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -325,6 +325,7 @@ void bsg_remove_queue(struct request_queue *q)
bsg_unregister_queue(bset->bd);
blk_mq_destroy_queue(q);
+ blk_put_queue(q);
blk_mq_free_tag_set(&bset->tag_set);
kfree(bset);
}
@@ -400,6 +401,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
return q;
out_cleanup_queue:
blk_mq_destroy_queue(q);
+ blk_put_queue(q);
out_queue:
blk_mq_free_tag_set(set);
out_tag_set:
diff --git a/block/bsg.c b/block/bsg.c
index 2ab1351eb082..8eba57b9bb46 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -175,8 +175,10 @@ static void bsg_device_release(struct device *dev)
void bsg_unregister_queue(struct bsg_device *bd)
{
- if (bd->queue->kobj.sd)
- sysfs_remove_link(&bd->queue->kobj, "bsg");
+ struct gendisk *disk = bd->queue->disk;
+
+ if (disk && disk->queue_kobj.sd)
+ sysfs_remove_link(&disk->queue_kobj, "bsg");
cdev_device_del(&bd->cdev, &bd->device);
put_device(&bd->device);
}
@@ -216,8 +218,9 @@ struct bsg_device *bsg_register_queue(struct request_queue *q,
if (ret)
goto out_put_device;
- if (q->kobj.sd) {
- ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg");
+ if (q->disk && q->disk->queue_kobj.sd) {
+ ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj,
+ "bsg");
if (ret)
goto out_device_del;
}
diff --git a/block/elevator.c b/block/elevator.c
index bd71f0fc4e4b..adee58e48e2d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -57,7 +57,7 @@ static LIST_HEAD(elv_list);
* Query io scheduler to see if the current process issuing bio may be
* merged with rq.
*/
-static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
+static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
@@ -65,7 +65,7 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
- return 1;
+ return true;
}
/*
@@ -83,78 +83,45 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static inline bool elv_support_features(unsigned int elv_features,
- unsigned int required_features)
+static inline bool elv_support_features(struct request_queue *q,
+ const struct elevator_type *e)
{
- return (required_features & elv_features) == required_features;
+ return (q->required_elevator_features & e->elevator_features) ==
+ q->required_elevator_features;
}
/**
- * elevator_match - Test an elevator name and features
+ * elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
* @name: Elevator name to test
- * @required_features: Features that the elevator must provide
*
- * Return true if the elevator @e name matches @name and if @e provides all
- * the features specified by @required_features.
+ * Return true if the elevator @e's name or alias matches @name.
*/
-static bool elevator_match(const struct elevator_type *e, const char *name,
- unsigned int required_features)
+static bool elevator_match(const struct elevator_type *e, const char *name)
{
- if (!elv_support_features(e->elevator_features, required_features))
- return false;
- if (!strcmp(e->elevator_name, name))
- return true;
- if (e->elevator_alias && !strcmp(e->elevator_alias, name))
- return true;
-
- return false;
+ return !strcmp(e->elevator_name, name) ||
+ (e->elevator_alias && !strcmp(e->elevator_alias, name));
}
-/**
- * elevator_find - Find an elevator
- * @name: Name of the elevator to find
- * @required_features: Features that the elevator must provide
- *
- * Return the first registered scheduler with name @name and supporting the
- * features @required_features and NULL otherwise.
- */
-static struct elevator_type *elevator_find(const char *name,
- unsigned int required_features)
+static struct elevator_type *__elevator_find(const char *name)
{
struct elevator_type *e;
- list_for_each_entry(e, &elv_list, list) {
- if (elevator_match(e, name, required_features))
+ list_for_each_entry(e, &elv_list, list)
+ if (elevator_match(e, name))
return e;
- }
-
return NULL;
}
-static void elevator_put(struct elevator_type *e)
-{
- module_put(e->elevator_owner);
-}
-
-static struct elevator_type *elevator_get(struct request_queue *q,
- const char *name, bool try_loading)
+static struct elevator_type *elevator_find_get(struct request_queue *q,
+ const char *name)
{
struct elevator_type *e;
spin_lock(&elv_list_lock);
-
- e = elevator_find(name, q->required_elevator_features);
- if (!e && try_loading) {
- spin_unlock(&elv_list_lock);
- request_module("%s-iosched", name);
- spin_lock(&elv_list_lock);
- e = elevator_find(name, q->required_elevator_features);
- }
-
- if (e && !try_module_get(e->elevator_owner))
+ e = __elevator_find(name);
+ if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
e = NULL;
-
spin_unlock(&elv_list_lock);
return e;
}
@@ -170,6 +137,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
if (unlikely(!eq))
return NULL;
+ __elevator_get(e);
eq->type = e;
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
@@ -499,7 +467,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
lockdep_assert_held(&q->sysfs_lock);
- error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
+ error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
if (attr) {
@@ -512,7 +480,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
- e->registered = 1;
+ set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
}
return error;
}
@@ -523,13 +491,9 @@ void elv_unregister_queue(struct request_queue *q)
lockdep_assert_held(&q->sysfs_lock);
- if (e && e->registered) {
- struct elevator_queue *e = q->elevator;
-
+ if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
-
- e->registered = 0;
}
}
@@ -555,7 +519,7 @@ int elv_register(struct elevator_type *e)
/* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
- if (elevator_find(e->elevator_name, 0)) {
+ if (__elevator_find(e->elevator_name)) {
spin_unlock(&elv_list_lock);
kmem_cache_destroy(e->icq_cache);
return -EBUSY;
@@ -588,39 +552,6 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
-static int elevator_switch_mq(struct request_queue *q,
- struct elevator_type *new_e)
-{
- int ret;
-
- lockdep_assert_held(&q->sysfs_lock);
-
- if (q->elevator) {
- elv_unregister_queue(q);
- elevator_exit(q);
- }
-
- ret = blk_mq_init_sched(q, new_e);
- if (ret)
- goto out;
-
- if (new_e) {
- ret = elv_register_queue(q, true);
- if (ret) {
- elevator_exit(q);
- goto out;
- }
- }
-
- if (new_e)
- blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
- else
- blk_add_trace_msg(q, "elv switch: none");
-
-out:
- return ret;
-}
-
static inline bool elv_support_iosched(struct request_queue *q)
{
if (!queue_is_mq(q) ||
@@ -642,7 +573,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
!blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
- return elevator_get(q, "mq-deadline", false);
+ return elevator_find_get(q, "mq-deadline");
}
/*
@@ -656,14 +587,13 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q)
spin_lock(&elv_list_lock);
list_for_each_entry(e, &elv_list, list) {
- if (elv_support_features(e->elevator_features,
- q->required_elevator_features)) {
+ if (elv_support_features(q, e)) {
found = e;
break;
}
}
- if (found && !try_module_get(found->elevator_owner))
+ if (found && !elevator_tryget(found))
found = NULL;
spin_unlock(&elv_list_lock);
@@ -713,115 +643,147 @@ void elevator_init_mq(struct request_queue *q)
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
"falling back to \"none\"\n", e->elevator_name);
- elevator_put(e);
}
+
+ elevator_put(e);
}
/*
- * switch to new_e io scheduler. be careful not to introduce deadlocks -
- * we don't free the old io scheduler, before we have allocated what we
- * need for the new one. this way we have a chance of going back to the old
- * one, if the new one fails init for some reason.
+ * Switch to new_e io scheduler.
+ *
+ * If switching fails, we are most likely running out of memory and not able
+ * to restore the old io scheduler, so leaving the io scheduler being none.
*/
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
- int err;
+ int ret;
lockdep_assert_held(&q->sysfs_lock);
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
- err = elevator_switch_mq(q, new_e);
+ if (q->elevator) {
+ elv_unregister_queue(q);
+ elevator_exit(q);
+ }
+
+ ret = blk_mq_init_sched(q, new_e);
+ if (ret)
+ goto out_unfreeze;
+
+ ret = elv_register_queue(q, true);
+ if (ret) {
+ elevator_exit(q);
+ goto out_unfreeze;
+ }
+ blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+out_unfreeze:
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
- return err;
+ if (ret) {
+ pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
+ new_e->elevator_name);
+ }
+
+ return ret;
+}
+
+void elevator_disable(struct request_queue *q)
+{
+ lockdep_assert_held(&q->sysfs_lock);
+
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
+ elv_unregister_queue(q);
+ elevator_exit(q);
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+ q->elevator = NULL;
+ q->nr_requests = q->tag_set->queue_depth;
+ blk_add_trace_msg(q, "elv switch: none");
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
}
/*
* Switch this queue to the given IO scheduler.
*/
-static int __elevator_change(struct request_queue *q, const char *name)
+static int elevator_change(struct request_queue *q, const char *elevator_name)
{
- char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
+ int ret;
/* Make sure queue is not in the middle of being removed */
if (!blk_queue_registered(q))
return -ENOENT;
- /*
- * Special case for mq, turn off scheduling
- */
- if (!strncmp(name, "none", 4)) {
- if (!q->elevator)
- return 0;
- return elevator_switch(q, NULL);
+ if (!strncmp(elevator_name, "none", 4)) {
+ if (q->elevator)
+ elevator_disable(q);
+ return 0;
}
- strlcpy(elevator_name, name, sizeof(elevator_name));
- e = elevator_get(q, strstrip(elevator_name), true);
- if (!e)
- return -EINVAL;
-
- if (q->elevator &&
- elevator_match(q->elevator->type, elevator_name, 0)) {
- elevator_put(e);
+ if (q->elevator && elevator_match(q->elevator->type, elevator_name))
return 0;
- }
- return elevator_switch(q, e);
+ e = elevator_find_get(q, elevator_name);
+ if (!e) {
+ request_module("%s-iosched", elevator_name);
+ e = elevator_find_get(q, elevator_name);
+ if (!e)
+ return -EINVAL;
+ }
+ ret = elevator_switch(q, e);
+ elevator_put(e);
+ return ret;
}
-ssize_t elv_iosched_store(struct request_queue *q, const char *name,
+ssize_t elv_iosched_store(struct request_queue *q, const char *buf,
size_t count)
{
+ char elevator_name[ELV_NAME_MAX];
int ret;
if (!elv_support_iosched(q))
return count;
- ret = __elevator_change(q, name);
+ strlcpy(elevator_name, buf, sizeof(elevator_name));
+ ret = elevator_change(q, strstrip(elevator_name));
if (!ret)
return count;
-
return ret;
}
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
- struct elevator_queue *e = q->elevator;
- struct elevator_type *elv = NULL;
- struct elevator_type *__e;
+ struct elevator_queue *eq = q->elevator;
+ struct elevator_type *cur = NULL, *e;
int len = 0;
- if (!queue_is_mq(q))
+ if (!elv_support_iosched(q))
return sprintf(name, "none\n");
- if (!q->elevator)
+ if (!q->elevator) {
len += sprintf(name+len, "[none] ");
- else
- elv = e->type;
+ } else {
+ len += sprintf(name+len, "none ");
+ cur = eq->type;
+ }
spin_lock(&elv_list_lock);
- list_for_each_entry(__e, &elv_list, list) {
- if (elv && elevator_match(elv, __e->elevator_name, 0)) {
- len += sprintf(name+len, "[%s] ", elv->elevator_name);
- continue;
- }
- if (elv_support_iosched(q) &&
- elevator_match(__e, __e->elevator_name,
- q->required_elevator_features))
- len += sprintf(name+len, "%s ", __e->elevator_name);
+ list_for_each_entry(e, &elv_list, list) {
+ if (e == cur)
+ len += sprintf(name+len, "[%s] ", e->elevator_name);
+ else if (elv_support_features(q, e))
+ len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
- if (q->elevator)
- len += sprintf(name+len, "none");
-
- len += sprintf(len+name, "\n");
+ len += sprintf(name+len, "\n");
return len;
}
diff --git a/block/elevator.h b/block/elevator.h
index 3f0593b3bf9d..774a8f6b99e6 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -84,6 +84,21 @@ struct elevator_type
struct list_head list;
};
+static inline bool elevator_tryget(struct elevator_type *e)
+{
+ return try_module_get(e->elevator_owner);
+}
+
+static inline void __elevator_get(struct elevator_type *e)
+{
+ __module_get(e->elevator_owner);
+}
+
+static inline void elevator_put(struct elevator_type *e)
+{
+ module_put(e->elevator_owner);
+}
+
#define ELV_HASH_BITS 6
void elv_rqhash_del(struct request_queue *q, struct request *rq);
@@ -100,10 +115,13 @@ struct elevator_queue
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
- unsigned int registered:1;
+ unsigned long flags;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};
+#define ELEVATOR_FLAG_REGISTERED 0
+#define ELEVATOR_FLAG_DISABLE_WBT 1
+
/*
* block elevator interface
*/
diff --git a/block/fops.c b/block/fops.c
index b90742595317..50d245e8c913 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -405,12 +405,6 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
return ret;
}
-static int blkdev_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return generic_writepages(mapping, wbc);
-}
-
const struct address_space_operations def_blk_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
@@ -419,7 +413,6 @@ const struct address_space_operations def_blk_aops = {
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
- .writepages = blkdev_writepages,
.direct_IO = blkdev_direct_IO,
.migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
diff --git a/block/genhd.c b/block/genhd.c
index 0f9769db2de8..08f76135a637 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -356,7 +356,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
}
EXPORT_SYMBOL_GPL(disk_uevent);
-int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
+int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner)
{
struct block_device *bdev;
@@ -366,6 +366,9 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
+ /* Someone else has bdev exclusively open? */
+ if (disk->part0->bd_holder && disk->part0->bd_holder != owner)
+ return -EBUSY;
set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
@@ -479,10 +482,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
goto out_put_holder_dir;
}
- ret = bd_register_pending_holders(disk);
- if (ret < 0)
- goto out_put_slave_dir;
-
ret = blk_register_queue(disk);
if (ret)
goto out_put_slave_dir;
@@ -500,7 +499,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
bdev_add(disk->part0, ddev->devt);
if (get_capacity(disk))
- disk_scan_partitions(disk, FMODE_READ);
+ disk_scan_partitions(disk, FMODE_READ, NULL);
/*
* Announce the disk and partitions after all partitions are
@@ -530,6 +529,7 @@ out_unregister_queue:
rq_qos_exit(disk->queue);
out_put_slave_dir:
kobject_put(disk->slave_dir);
+ disk->slave_dir = NULL;
out_put_holder_dir:
kobject_put(disk->part0->bd_holder_dir);
out_del_integrity:
@@ -560,6 +560,11 @@ void blk_mark_disk_dead(struct gendisk *disk)
{
set_bit(GD_DEAD, &disk->state);
blk_queue_start_drain(disk->queue);
+
+ /*
+ * Stop buffered writers from dirtying pages that can't be written out.
+ */
+ set_capacity_and_notify(disk, 0);
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
@@ -629,6 +634,7 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0->bd_holder_dir);
kobject_put(disk->slave_dir);
+ disk->slave_dir = NULL;
part_stat_set_all(disk->part0, 0);
disk->part0->bd_stamp = 0;
@@ -643,7 +649,9 @@ void del_gendisk(struct gendisk *disk)
blk_sync_queue(q);
blk_flush_integrity();
- blk_mq_cancel_work_sync(q);
+
+ if (queue_is_mq(q))
+ blk_mq_cancel_work_sync(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
@@ -1193,21 +1201,10 @@ struct class block_class = {
.dev_uevent = block_uevent,
};
-static char *block_devnode(struct device *dev, umode_t *mode,
- kuid_t *uid, kgid_t *gid)
-{
- struct gendisk *disk = dev_to_disk(dev);
-
- if (disk->fops->devnode)
- return disk->fops->devnode(disk, mode);
- return NULL;
-}
-
const struct device_type disk_type = {
.name = "disk",
.groups = disk_attr_groups,
.release = disk_release,
- .devnode = block_devnode,
};
#ifdef CONFIG_PROC_FS
@@ -1412,7 +1409,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q;
struct gendisk *disk;
- q = blk_alloc_queue(node, false);
+ q = blk_alloc_queue(node);
if (!q)
return NULL;
diff --git a/block/holder.c b/block/holder.c
index 5283bc804cc1..37d18c13d958 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -4,7 +4,7 @@
struct bd_holder_disk {
struct list_head list;
- struct block_device *bdev;
+ struct kobject *holder_dir;
int refcnt;
};
@@ -14,7 +14,7 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
struct bd_holder_disk *holder;
list_for_each_entry(holder, &disk->slave_bdevs, list)
- if (holder->bdev == bdev)
+ if (holder->holder_dir == bdev->bd_holder_dir)
return holder;
return NULL;
}
@@ -29,19 +29,6 @@ static void del_symlink(struct kobject *from, struct kobject *to)
sysfs_remove_link(from, kobject_name(to));
}
-static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk)
-{
- int ret;
-
- ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
- if (ret)
- return ret;
- ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
- if (ret)
- del_symlink(disk->slave_dir, bdev_kobj(bdev));
- return ret;
-}
-
/**
* bd_link_disk_holder - create symlinks between holding disk and slave bdev
* @bdev: the claimed slave bdev
@@ -75,12 +62,30 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
struct bd_holder_disk *holder;
int ret = 0;
- mutex_lock(&disk->open_mutex);
+ if (WARN_ON_ONCE(!disk->slave_dir))
+ return -EINVAL;
+
+ if (bdev->bd_disk == disk)
+ return -EINVAL;
+
+ /*
+ * del_gendisk drops the initial reference to bd_holder_dir, so we
+ * need to keep our own here to allow for cleanup past that point.
+ */
+ mutex_lock(&bdev->bd_disk->open_mutex);
+ if (!disk_live(bdev->bd_disk)) {
+ mutex_unlock(&bdev->bd_disk->open_mutex);
+ return -ENODEV;
+ }
+ kobject_get(bdev->bd_holder_dir);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
+ mutex_lock(&disk->open_mutex);
WARN_ON_ONCE(!bdev->bd_holder);
holder = bd_find_holder_disk(bdev, disk);
if (holder) {
+ kobject_put(bdev->bd_holder_dir);
holder->refcnt++;
goto out_unlock;
}
@@ -92,36 +97,32 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
}
INIT_LIST_HEAD(&holder->list);
- holder->bdev = bdev;
holder->refcnt = 1;
- if (disk->slave_dir) {
- ret = __link_disk_holder(bdev, disk);
- if (ret) {
- kfree(holder);
- goto out_unlock;
- }
- }
+ holder->holder_dir = bdev->bd_holder_dir;
+ ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
+ if (ret)
+ goto out_free_holder;
+ ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+ if (ret)
+ goto out_del_symlink;
list_add(&holder->list, &disk->slave_bdevs);
- /*
- * del_gendisk drops the initial reference to bd_holder_dir, so we need
- * to keep our own here to allow for cleanup past that point.
- */
- kobject_get(bdev->bd_holder_dir);
+ mutex_unlock(&disk->open_mutex);
+ return 0;
+
+out_del_symlink:
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+out_free_holder:
+ kfree(holder);
out_unlock:
mutex_unlock(&disk->open_mutex);
+ if (ret)
+ kobject_put(bdev->bd_holder_dir);
return ret;
}
EXPORT_SYMBOL_GPL(bd_link_disk_holder);
-static void __unlink_disk_holder(struct block_device *bdev,
- struct gendisk *disk)
-{
- del_symlink(disk->slave_dir, bdev_kobj(bdev));
- del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
-}
-
/**
* bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
* @bdev: the calimed slave bdev
@@ -136,36 +137,18 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
struct bd_holder_disk *holder;
+ if (WARN_ON_ONCE(!disk->slave_dir))
+ return;
+
mutex_lock(&disk->open_mutex);
holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
- if (disk->slave_dir)
- __unlink_disk_holder(bdev, disk);
- kobject_put(bdev->bd_holder_dir);
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+ del_symlink(holder->holder_dir, &disk_to_dev(disk)->kobj);
+ kobject_put(holder->holder_dir);
list_del_init(&holder->list);
kfree(holder);
}
mutex_unlock(&disk->open_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
-
-int bd_register_pending_holders(struct gendisk *disk)
-{
- struct bd_holder_disk *holder;
- int ret;
-
- mutex_lock(&disk->open_mutex);
- list_for_each_entry(holder, &disk->slave_bdevs, list) {
- ret = __link_disk_holder(holder->bdev, disk);
- if (ret)
- goto out_undo;
- }
- mutex_unlock(&disk->open_mutex);
- return 0;
-
-out_undo:
- list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list)
- __unlink_disk_holder(holder->bdev, disk);
- mutex_unlock(&disk->open_mutex);
- return ret;
-}
diff --git a/block/ioctl.c b/block/ioctl.c
index 60121e89052b..96617512982e 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -467,9 +467,10 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
* user space. Note the separate arg/argp parameters that are needed
* to deal with the compat_ptr() conversion.
*/
-static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long arg, void __user *argp)
+static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd,
+ unsigned long arg, void __user *argp)
{
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
unsigned int max_sectors;
switch (cmd) {
@@ -527,7 +528,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
return -EACCES;
if (bdev_is_partition(bdev))
return -EINVAL;
- return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
+ return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL,
+ file);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
@@ -605,7 +607,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
}
- ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
+ ret = blkdev_common_ioctl(file, mode, cmd, arg, argp);
if (ret != -ENOIOCTLCMD)
return ret;
@@ -674,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
}
- ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
+ ret = blkdev_common_ioctl(file, mode, cmd, arg, argp);
if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl)
ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 5639921dfa92..f10c2a0d18d4 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -131,6 +131,20 @@ static u8 dd_rq_ioclass(struct request *rq)
}
/*
+ * get the request before `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_earlier_request(struct request *rq)
+{
+ struct rb_node *node = rb_prev(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
+}
+
+/*
* get the request after `rq' in sector-sorted order
*/
static inline struct request *
@@ -278,6 +292,39 @@ static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
}
/*
+ * Check if rq has a sequential request preceding it.
+ */
+static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
+{
+ struct request *prev = deadline_earlier_request(rq);
+
+ if (!prev)
+ return false;
+
+ return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
+}
+
+/*
+ * Skip all write requests that are sequential from @rq, even if we cross
+ * a zone boundary.
+ */
+static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
+ struct request *rq)
+{
+ sector_t pos = blk_rq_pos(rq);
+ sector_t skipped_sectors = 0;
+
+ while (rq) {
+ if (blk_rq_pos(rq) != pos + skipped_sectors)
+ break;
+ skipped_sectors += blk_rq_sectors(rq);
+ rq = deadline_latter_request(rq);
+ }
+
+ return rq;
+}
+
+/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
*/
@@ -297,11 +344,16 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
/*
* Look for a write request that can be dispatched, that is one with
- * an unlocked target zone.
+ * an unlocked target zone. For some HDDs, breaking a sequential
+ * write stream can lead to lower throughput, so make sure to preserve
+ * sequential write streams, even if that stream crosses into the next
+ * zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
- if (blk_req_can_dispatch_to_zone(rq))
+ if (blk_req_can_dispatch_to_zone(rq) &&
+ (blk_queue_nonrot(rq->q) ||
+ !deadline_is_seq_write(dd, rq)))
goto out;
}
rq = NULL;
@@ -331,13 +383,19 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
/*
* Look for a write request that can be dispatched, that is one with
- * an unlocked target zone.
+ * an unlocked target zone. For some HDDs, breaking a sequential
+ * write stream can lead to lower throughput, so make sure to preserve
+ * sequential write streams, even if that stream crosses into the next
+ * zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
break;
- rq = deadline_latter_request(rq);
+ if (blk_queue_nonrot(rq->q))
+ rq = deadline_latter_request(rq);
+ else
+ rq = deadline_skip_seq_writes(dd, rq);
}
spin_unlock_irqrestore(&dd->zone_lock, flags);
@@ -789,6 +847,18 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
+static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ enum dd_prio p;
+
+ for (p = 0; p <= DD_PRIO_MAX; p++)
+ if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
+ return true;
+
+ return false;
+}
+
/*
* Callback from inside blk_mq_free_request().
*
@@ -828,9 +898,10 @@ static void dd_finish_request(struct request *rq)
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
- if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ if (dd_has_write_work(rq->mq_hctx))
+ blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
}
}
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 9bdb833e5817..463873f61e01 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -2461,6 +2461,44 @@ static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key)
return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step));
}
+static void opal_lock_check_for_saved_key(struct opal_dev *dev,
+ struct opal_lock_unlock *lk_unlk)
+{
+ struct opal_suspend_data *iter;
+
+ if (lk_unlk->l_state != OPAL_LK ||
+ lk_unlk->session.opal_key.key_len > 0)
+ return;
+
+ /*
+ * Usually when closing a crypto device (eg: dm-crypt with LUKS) the
+ * volume key is not required, as it requires root privileges anyway,
+ * and root can deny access to a disk in many ways regardless.
+ * Requiring the volume key to lock the device is a peculiarity of the
+ * OPAL specification. Given we might already have saved the key if
+ * the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use
+ * that key to lock the device if no key was provided here, the
+ * locking range matches and the appropriate flag was passed with
+ * 'IOC_OPAL_SAVE'.
+ * This allows integrating OPAL with tools and libraries that are used
+ * to the common behaviour and do not ask for the volume key when
+ * closing a device.
+ */
+ setup_opal_dev(dev);
+ list_for_each_entry(iter, &dev->unlk_lst, node) {
+ if ((iter->unlk.flags & OPAL_SAVE_FOR_LOCK) &&
+ iter->lr == lk_unlk->session.opal_key.lr &&
+ iter->unlk.session.opal_key.key_len > 0) {
+ lk_unlk->session.opal_key.key_len =
+ iter->unlk.session.opal_key.key_len;
+ memcpy(lk_unlk->session.opal_key.key,
+ iter->unlk.session.opal_key.key,
+ iter->unlk.session.opal_key.key_len);
+ break;
+ }
+ }
+}
+
static int opal_lock_unlock(struct opal_dev *dev,
struct opal_lock_unlock *lk_unlk)
{
@@ -2470,6 +2508,7 @@ static int opal_lock_unlock(struct opal_dev *dev,
return -EINVAL;
mutex_lock(&dev->dev_lock);
+ opal_lock_check_for_saved_key(dev, lk_unlk);
ret = __opal_lock_unlock(dev, lk_unlk);
mutex_unlock(&dev->dev_lock);