summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig1
-rw-r--r--block/blk-cgroup.c6
-rw-r--r--block/blk-core.c129
-rw-r--r--block/blk-iocost.c58
-rw-r--r--block/blk-map.c9
-rw-r--r--block/blk-merge.c50
-rw-r--r--block/blk-mq-debugfs.c1
-rw-r--r--block/blk-mq-sched.c82
-rw-r--r--block/blk-mq.c43
-rw-r--r--block/blk-settings.c37
-rw-r--r--block/blk.h45
-rw-r--r--block/genhd.c37
-rw-r--r--block/ioctl.c150
-rw-r--r--block/partitions/core.c168
14 files changed, 410 insertions, 406 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 3bc76bb113a0..41cb34b0fcd1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -146,6 +146,7 @@ config BLK_CGROUP_IOLATENCY
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
depends on BLK_CGROUP=y
+ select BLK_RQ_IO_DATA_LEN
select BLK_RQ_ALLOC_TIME
---help---
Enabling this option enables the .weight interface for cost
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 930212c1a512..0ecc897b225c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1530,6 +1530,10 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
u64 old = atomic64_read(&blkg->delay_start);
+ /* negative use_delay means no scaling, see blkcg_set_delay() */
+ if (atomic_read(&blkg->use_delay) < 0)
+ return;
+
/*
* We only want to scale down every second. The idea here is that we
* want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
@@ -1717,6 +1721,8 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
*/
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
+ return;
blkcg_scale_delay(blkg, now);
atomic64_add(delta, &blkg->delay_nsec);
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 7e4a1da0715e..7f11560bfddb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -440,6 +440,23 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
}
}
+static inline int bio_queue_enter(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
+ int ret;
+
+ ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
+ if (unlikely(ret)) {
+ if (nowait && !blk_queue_dying(q))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ }
+
+ return ret;
+}
+
void blk_queue_exit(struct request_queue *q)
{
percpu_ref_put(&q->q_usage_counter);
@@ -963,12 +980,13 @@ generic_make_request_checks(struct bio *bio)
}
/*
- * Various block parts want %current->io_context and lazy ioc
- * allocation ends up trading a lot of pain for a small amount of
- * memory. Just allocate it upfront. This may fail and block
- * layer knows how to live with it.
+ * Various block parts want %current->io_context, so allocate it up
+ * front rather than dealing with lots of pain to allocate it only
+ * where needed. This may fail and the block layer knows how to live
+ * with it.
*/
- create_io_context(GFP_ATOMIC, q->node);
+ if (unlikely(!current->io_context))
+ create_task_io_context(current, GFP_ATOMIC, q->node);
if (!blkcg_bio_issue_check(q, bio))
return false;
@@ -991,28 +1009,13 @@ end_io:
}
/**
- * generic_make_request - hand a buffer to its device driver for I/O
+ * generic_make_request - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
*
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status. The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may resubmit the bio to
- * a lower device by calling into generic_make_request recursively, which
- * means the bio should NOT be touched after the call to ->make_request_fn.
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
*/
blk_qc_t generic_make_request(struct bio *bio)
{
@@ -1063,16 +1066,17 @@ blk_qc_t generic_make_request(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
- blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
- BLK_MQ_REQ_NOWAIT : 0;
- if (likely(blk_queue_enter(q, flags) == 0)) {
+ if (likely(bio_queue_enter(bio) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
- ret = q->make_request_fn(q, bio);
+ if (q->make_request_fn)
+ ret = q->make_request_fn(q, bio);
+ else
+ ret = blk_mq_make_request(q, bio);
blk_queue_exit(q);
@@ -1090,12 +1094,6 @@ blk_qc_t generic_make_request(struct bio *bio)
bio_list_merge(&bio_list_on_stack[0], &lower);
bio_list_merge(&bio_list_on_stack[0], &same);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- } else {
- if (unlikely(!blk_queue_dying(q) &&
- (bio->bi_opf & REQ_NOWAIT)))
- bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
}
bio = bio_list_pop(&bio_list_on_stack[0]);
} while (bio);
@@ -1112,28 +1110,22 @@ EXPORT_SYMBOL(generic_make_request);
*
* This function behaves like generic_make_request(), but does not protect
* against recursion. Must only be used if the called driver is known
- * to not call generic_make_request (or direct_make_request) again from
- * its make_request function. (Calling direct_make_request again from
- * a workqueue is perfectly fine as that doesn't recurse).
+ * to be blk-mq based.
*/
blk_qc_t direct_make_request(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
- bool nowait = bio->bi_opf & REQ_NOWAIT;
blk_qc_t ret;
+ if (WARN_ON_ONCE(q->make_request_fn)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
if (!generic_make_request_checks(bio))
return BLK_QC_T_NONE;
-
- if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
- if (nowait && !blk_queue_dying(q))
- bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
+ if (unlikely(bio_queue_enter(bio)))
return BLK_QC_T_NONE;
- }
-
- ret = q->make_request_fn(q, bio);
+ ret = blk_mq_make_request(q, bio);
blk_queue_exit(q);
return ret;
}
@@ -1143,17 +1135,17 @@ EXPORT_SYMBOL_GPL(direct_make_request);
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
*
- * submit_bio() is very similar in purpose to generic_make_request(), and
- * uses that function to do most of the work. Both are fairly rough
- * interfaces; @bio must be presetup and ready for I/O.
+ * submit_bio() is used to submit I/O requests to block devices. It is passed a
+ * fully set up &struct bio that describes the I/O that needs to be done. The
+ * bio will be send to the device described by the bi_disk and bi_partno fields.
*
+ * The success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the ->bi_end_io() callback
+ * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has
+ * been called.
*/
blk_qc_t submit_bio(struct bio *bio)
{
- bool workingset_read = false;
- unsigned long pflags;
- blk_qc_t ret;
-
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;
@@ -1172,8 +1164,6 @@ blk_qc_t submit_bio(struct bio *bio)
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
- if (bio_flagged(bio, BIO_WORKINGSET))
- workingset_read = true;
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
@@ -1189,20 +1179,24 @@ blk_qc_t submit_bio(struct bio *bio)
}
/*
- * If we're reading data that is part of the userspace
- * workingset, count submission time as memory stall. When the
- * device is congested, or the submitting cgroup IO-throttled,
- * submission can be a significant part of overall IO time.
+ * If we're reading data that is part of the userspace workingset, count
+ * submission time as memory stall. When the device is congested, or
+ * the submitting cgroup IO-throttled, submission can be a significant
+ * part of overall IO time.
*/
- if (workingset_read)
- psi_memstall_enter(&pflags);
-
- ret = generic_make_request(bio);
+ if (unlikely(bio_op(bio) == REQ_OP_READ &&
+ bio_flagged(bio, BIO_WORKINGSET))) {
+ unsigned long pflags;
+ blk_qc_t ret;
- if (workingset_read)
+ psi_memstall_enter(&pflags);
+ ret = generic_make_request(bio);
psi_memstall_leave(&pflags);
- return ret;
+ return ret;
+ }
+
+ return generic_make_request(bio);
}
EXPORT_SYMBOL(submit_bio);
@@ -1638,7 +1632,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
rq->ioprio = rq_src->ioprio;
- rq->extra_len = rq_src->extra_len;
return 0;
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 7c1fe605d0d6..12eefb3d113f 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -260,6 +260,7 @@ enum {
VTIME_PER_SEC_SHIFT = 37,
VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
+ VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
/* bound vrate adjustments within two orders of magnitude */
VRATE_MIN_PPM = 10000, /* 1% */
@@ -1206,14 +1207,14 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
+static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
struct blkcg_gq *blkg = iocg_to_blkg(iocg);
u64 vtime = atomic64_read(&iocg->vtime);
u64 vmargin = ioc->margin_us * now->vrate;
u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
- u64 expires, oexpires;
+ u64 delta_ns, expires, oexpires;
u32 hw_inuse;
lockdep_assert_held(&iocg->waitq.lock);
@@ -1236,15 +1237,10 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
return false;
/* use delay */
- if (cost) {
- u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
- now->vrate);
- blkcg_add_delay(blkg, now->now_ns, cost_ns);
- }
- blkcg_use_delay(blkg);
-
- expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
- now->vrate) * NSEC_PER_USEC;
+ delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
+ now->vrate) * NSEC_PER_USEC;
+ blkcg_set_delay(blkg, delta_ns);
+ expires = now->now_ns + delta_ns;
/* if already active and close enough, don't bother */
oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
@@ -1265,7 +1261,7 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
spin_lock_irqsave(&iocg->waitq.lock, flags);
ioc_now(iocg->ioc, &now);
- iocg_kick_delay(iocg, &now, 0);
+ iocg_kick_delay(iocg, &now);
spin_unlock_irqrestore(&iocg->waitq.lock, flags);
return HRTIMER_NORESTART;
@@ -1383,7 +1379,7 @@ static void ioc_timer_fn(struct timer_list *timer)
if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
/* might be oversleeping vtime / hweight changes, kick */
iocg_kick_waitq(iocg, &now);
- iocg_kick_delay(iocg, &now, 0);
+ iocg_kick_delay(iocg, &now);
} else if (iocg_is_idle(iocg)) {
/* no waiter and idle, deactivate */
iocg->last_inuse = iocg->inuse;
@@ -1678,6 +1674,31 @@ static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
return cost;
}
+static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
+ u64 *costp)
+{
+ unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
+ break;
+ case REQ_OP_WRITE:
+ *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
+ break;
+ default:
+ *costp = 0;
+ }
+}
+
+static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
+{
+ u64 cost;
+
+ calc_size_vtime_cost_builtin(rq, ioc, &cost);
+ return cost;
+}
+
static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
{
struct blkcg_gq *blkg = bio->bi_blkg;
@@ -1762,7 +1783,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
*/
if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
iocg->abs_vdebt += abs_cost;
- if (iocg_kick_delay(iocg, &now, cost))
+ if (iocg_kick_delay(iocg, &now))
blkcg_schedule_throttle(rqos->q,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
spin_unlock_irq(&iocg->waitq.lock);
@@ -1850,7 +1871,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
spin_lock_irqsave(&iocg->waitq.lock, flags);
if (likely(!list_empty(&iocg->active_list))) {
iocg->abs_vdebt += abs_cost;
- iocg_kick_delay(iocg, &now, cost);
+ iocg_kick_delay(iocg, &now);
} else {
iocg_commit_bio(iocg, bio, cost);
}
@@ -1868,7 +1889,7 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
{
struct ioc *ioc = rqos_to_ioc(rqos);
- u64 on_q_ns, rq_wait_ns;
+ u64 on_q_ns, rq_wait_ns, size_nsec;
int pidx, rw;
if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
@@ -1889,8 +1910,10 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
+ size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
- if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
+ if (on_q_ns <= size_nsec ||
+ on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
else
this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
@@ -2297,6 +2320,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
spin_lock_irq(&ioc->lock);
if (enable) {
+ blk_stat_enable_accounting(ioc->rqos.q);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
ioc->enabled = true;
} else {
diff --git a/block/blk-map.c b/block/blk-map.c
index b72c361911a4..b6fa343fea9f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -654,8 +654,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
bio = rq->bio;
} while (iov_iter_count(&i));
- if (!bio_flagged(bio, BIO_USER_MAPPED))
- rq->rq_flags |= RQF_COPY_USER;
return 0;
unmap_rq:
@@ -731,7 +729,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
{
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
- int do_copy = 0;
struct bio *bio, *orig_bio;
int ret;
@@ -740,8 +737,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (!len || !kbuf)
return -EINVAL;
- do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
- if (do_copy)
+ if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf))
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
else
bio = bio_map_kern(q, kbuf, len, gfp_mask);
@@ -752,9 +748,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= req_op(rq);
- if (do_copy)
- rq->rq_flags |= RQF_COPY_USER;
-
orig_bio = bio;
ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1534ed736363..a04e991b5ded 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -336,16 +336,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
- /*
- * Since we're recursing into make_request here, ensure
- * that we mark this bio as already having entered the queue.
- * If not, and the queue is going away, we can get stuck
- * forever on waiting for the queue reference to drop. But
- * that will never happen, as we're already holding a
- * reference to it.
- */
- bio_set_flag(*bio, BIO_QUEUE_ENTERED);
-
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
@@ -519,44 +509,20 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
*/
-int blk_rq_map_sg(struct request_queue *q, struct request *rq,
- struct scatterlist *sglist)
+int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
+ struct scatterlist *sglist, struct scatterlist **last_sg)
{
- struct scatterlist *sg = NULL;
int nsegs = 0;
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
- nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, &sg);
+ nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
- nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, &sg);
+ nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg);
else if (rq->bio)
- nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
-
- if (unlikely(rq->rq_flags & RQF_COPY_USER) &&
- (blk_rq_bytes(rq) & q->dma_pad_mask)) {
- unsigned int pad_len =
- (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
-
- sg->length += pad_len;
- rq->extra_len += pad_len;
- }
-
- if (q->dma_drain_size && q->dma_drain_needed(rq)) {
- if (op_is_write(req_op(rq)))
- memset(q->dma_drain_buffer, 0, q->dma_drain_size);
-
- sg_unmark_end(sg);
- sg = sg_next(sg);
- sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
- q->dma_drain_size,
- ((unsigned long)q->dma_drain_buffer) &
- (PAGE_SIZE - 1));
- nsegs++;
- rq->extra_len += q->dma_drain_size;
- }
+ nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
- if (sg)
- sg_mark_end(sg);
+ if (*last_sg)
+ sg_mark_end(*last_sg);
/*
* Something must have been wrong if the figured number of
@@ -566,7 +532,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
return nsegs;
}
-EXPORT_SYMBOL(blk_rq_map_sg);
+EXPORT_SYMBOL(__blk_rq_map_sg);
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b3f2ba483992..96b7a35c898a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -292,7 +292,6 @@ static const char *const rqf_name[] = {
RQF_NAME(MQ_INFLIGHT),
RQF_NAME(DONTPREP),
RQF_NAME(PREEMPT),
- RQF_NAME(COPY_USER),
RQF_NAME(FAILED),
RQF_NAME(QUIET),
RQF_NAME(ELVPRIV),
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 74cedea56034..fdcc2c1dd178 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -80,16 +80,22 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
blk_mq_run_hw_queue(hctx, true);
}
+#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
+
/*
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
*/
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
+ int ret = 0;
do {
struct request *rq;
@@ -97,12 +103,25 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;
+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = -EAGAIN;
+ break;
+ }
+
if (!blk_mq_get_dispatch_budget(hctx))
break;
rq = e->type->ops.dispatch_request(hctx);
if (!rq) {
blk_mq_put_dispatch_budget(hctx);
+ /*
+ * We're releasing without dispatching. Holding the
+ * budget could have blocked any "hctx"s with the
+ * same queue and if we didn't dispatch then there's
+ * no guarantee anyone will kick the queue. Kick it
+ * ourselves.
+ */
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
break;
}
@@ -113,6 +132,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
*/
list_add(&rq->queuelist, &rq_list);
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+ return ret;
}
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -130,16 +151,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * to be run again. This is necessary to avoid starving flushes.
*/
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+ int ret = 0;
do {
struct request *rq;
+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = -EAGAIN;
+ break;
+ }
+
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;
@@ -149,6 +179,14 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
rq = blk_mq_dequeue_from_ctx(hctx, ctx);
if (!rq) {
blk_mq_put_dispatch_budget(hctx);
+ /*
+ * We're releasing without dispatching. Holding the
+ * budget could have blocked any "hctx"s with the
+ * same queue and if we didn't dispatch then there's
+ * no guarantee anyone will kick the queue. Kick it
+ * ourselves.
+ */
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
break;
}
@@ -165,21 +203,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
WRITE_ONCE(hctx->dispatch_from, ctx);
+ return ret;
}
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+ int ret = 0;
LIST_HEAD(rq_list);
- /* RCU or SRCU read lock is needed before checking quiesced flag */
- if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
- return;
-
- hctx->run++;
-
/*
* If we have previous entries on our dispatch list, grab them first for
* more fair dispatch.
@@ -208,19 +242,41 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_mark_restart_hctx(hctx);
if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
if (has_sched_dispatch)
- blk_mq_do_dispatch_sched(hctx);
+ ret = blk_mq_do_dispatch_sched(hctx);
else
- blk_mq_do_dispatch_ctx(hctx);
+ ret = blk_mq_do_dispatch_ctx(hctx);
}
} else if (has_sched_dispatch) {
- blk_mq_do_dispatch_sched(hctx);
+ ret = blk_mq_do_dispatch_sched(hctx);
} else if (hctx->dispatch_busy) {
/* dequeue request one by one from sw queue if queue is busy */
- blk_mq_do_dispatch_ctx(hctx);
+ ret = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list, false);
}
+
+ return ret;
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+
+ /* RCU or SRCU read lock is needed before checking quiesced flag */
+ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+ return;
+
+ hctx->run++;
+
+ /*
+ * A return of -EAGAIN is an indication that hctx->dispatch is not
+ * empty and we must run again in order to avoid starving flushes.
+ */
+ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+ blk_mq_run_hw_queue(hctx, true);
+ }
}
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a7785df2c944..bcc3a2397d4a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -318,7 +318,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->nr_integrity_segments = 0;
#endif
/* tag was already set */
- rq->extra_len = 0;
WRITE_ONCE(rq->deadline, 0);
rq->timeout = 0;
@@ -667,15 +666,6 @@ void blk_mq_start_request(struct request *rq)
blk_add_timer(rq);
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
- /*
- * Make sure space for the drain appears. We know we can do
- * this because max_hw_segments has been adjusted to be one
- * fewer than the device can handle.
- */
- rq->nr_phys_segments++;
- }
-
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
q->integrity.profile->prepare_fn(rq);
@@ -695,8 +685,6 @@ static void __blk_mq_requeue_request(struct request *rq)
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
rq->rq_flags &= ~RQF_TIMED_OUT;
- if (q->dma_drain_size && blk_rq_bytes(rq))
- rq->nr_phys_segments--;
}
}
@@ -1206,6 +1194,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
+ bool no_budget_avail = false;
if (list_empty(list))
return false;
@@ -1224,6 +1213,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
hctx = rq->mq_hctx;
if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
blk_mq_put_driver_tag(rq);
+ no_budget_avail = true;
break;
}
@@ -1320,13 +1310,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
*
* If driver returns BLK_STS_RESOURCE and SCHED_RESTART
* bit is set, run queue after a delay to avoid IO stalls
- * that could otherwise occur if the queue is idle.
+ * that could otherwise occur if the queue is idle. We'll do
+ * similar if we couldn't get budget and SCHED_RESTART is set.
*/
needs_restart = blk_mq_sched_needs_restart(hctx);
if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
+ else if (needs_restart && (ret == BLK_STS_RESOURCE ||
+ no_budget_avail))
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true);
@@ -1542,6 +1534,25 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
EXPORT_SYMBOL(blk_mq_run_hw_queues);
/**
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
+ * @q: Pointer to the request queue to run.
+ * @msecs: Microseconds of delay to wait before running the queues.
+ */
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (blk_mq_hctx_stopped(hctx))
+ continue;
+
+ blk_mq_delay_run_hw_queue(hctx, msecs);
+ }
+}
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
+
+/**
* blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
* @q: request queue.
*
@@ -1973,7 +1984,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*
* Returns: Request queue cookie.
*/
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
@@ -2085,6 +2096,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
return cookie;
}
+EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
@@ -2944,7 +2956,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
- q->make_request_fn = blk_mq_make_request;
q->nr_requests = set->queue_depth;
/*
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 14397b4c4b53..2ab1967b9716 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -652,43 +652,6 @@ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
EXPORT_SYMBOL(blk_queue_update_dma_pad);
/**
- * blk_queue_dma_drain - Set up a drain buffer for excess dma.
- * @q: the request queue for the device
- * @dma_drain_needed: fn which returns non-zero if drain is necessary
- * @buf: physically contiguous buffer
- * @size: size of the buffer in bytes
- *
- * Some devices have excess DMA problems and can't simply discard (or
- * zero fill) the unwanted piece of the transfer. They have to have a
- * real area of memory to transfer it into. The use case for this is
- * ATAPI devices in DMA mode. If the packet command causes a transfer
- * bigger than the transfer size some HBAs will lock up if there
- * aren't DMA elements to contain the excess transfer. What this API
- * does is adjust the queue so that the buf is always appended
- * silently to the scatterlist.
- *
- * Note: This routine adjusts max_hw_segments to make room for appending
- * the drain buffer. If you call blk_queue_max_segments() after calling
- * this routine, you must set the limit to one fewer than your device
- * can support otherwise there won't be room for the drain buffer.
- */
-int blk_queue_dma_drain(struct request_queue *q,
- dma_drain_needed_fn *dma_drain_needed,
- void *buf, unsigned int size)
-{
- if (queue_max_segments(q) < 2)
- return -EINVAL;
- /* make room for appending the drain */
- blk_queue_max_segments(q, queue_max_segments(q) - 1);
- q->dma_drain_needed = dma_drain_needed;
- q->dma_drain_buffer = buf;
- q->dma_drain_size = size;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
-
-/**
* blk_queue_segment_boundary - set boundary rules for segment merging
* @q: the request queue for the device
* @mask: the memory boundary mask
diff --git a/block/blk.h b/block/blk.h
index 0a94ec68af32..73bd3b1c6938 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -303,26 +303,6 @@ void ioc_clear_queue(struct request_queue *q);
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
-/**
- * create_io_context - try to create task->io_context
- * @gfp_mask: allocation mask
- * @node: allocation node
- *
- * If %current->io_context is %NULL, allocate a new io_context and install
- * it. Returns the current %current->io_context which may be %NULL if
- * allocation failed.
- *
- * Note that this function can't be called with IRQ disabled because
- * task_lock which protects %current->io_context is IRQ-unsafe.
- */
-static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
-{
- WARN_ON_ONCE(irqs_disabled());
- if (unlikely(!current->io_context))
- create_task_io_context(current, gfp_mask, node);
- return current->io_context;
-}
-
/*
* Internal throttling interface
*/
@@ -389,20 +369,14 @@ char *disk_name(struct gendisk *hd, int partno, char *buf);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
-struct hd_struct *__must_check add_partition(struct gendisk *disk, int partno,
- sector_t start, sector_t len, int flags,
- struct partition_meta_info *info);
-void __delete_partition(struct percpu_ref *ref);
-void delete_partition(struct gendisk *disk, int partno);
+void delete_partition(struct gendisk *disk, struct hd_struct *part);
+int bdev_add_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length);
+int bdev_del_partition(struct block_device *bdev, int partno);
+int bdev_resize_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length);
int disk_expand_part_tbl(struct gendisk *disk, int target);
-
-static inline int hd_ref_init(struct hd_struct *part)
-{
- if (percpu_ref_init(&part->ref, __delete_partition, 0,
- GFP_KERNEL))
- return -ENOMEM;
- return 0;
-}
+int hd_ref_init(struct hd_struct *part);
static inline void hd_struct_get(struct hd_struct *part)
{
@@ -419,11 +393,6 @@ static inline void hd_struct_put(struct hd_struct *part)
percpu_ref_put(&part->ref);
}
-static inline void hd_struct_kill(struct hd_struct *part)
-{
- percpu_ref_kill(&part->ref);
-}
-
static inline void hd_free_part(struct hd_struct *part)
{
free_part_stats(part);
diff --git a/block/genhd.c b/block/genhd.c
index 06b642b23a07..c05d509877fa 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -878,6 +878,25 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
+static void invalidate_partition(struct gendisk *disk, int partno)
+{
+ struct block_device *bdev;
+
+ bdev = bdget_disk(disk, partno);
+ if (!bdev)
+ return;
+
+ fsync_bdev(bdev);
+ __invalidate_device(bdev, true);
+
+ /*
+ * Unhash the bdev inode for this device so that it gets evicted as soon
+ * as last inode reference is dropped.
+ */
+ remove_inode_hash(bdev->bd_inode);
+ bdput(bdev);
+}
+
void del_gendisk(struct gendisk *disk)
{
struct disk_part_iter piter;
@@ -896,13 +915,11 @@ void del_gendisk(struct gendisk *disk)
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
while ((part = disk_part_iter_next(&piter))) {
invalidate_partition(disk, part->partno);
- bdev_unhash_inode(part_devt(part));
- delete_partition(disk, part->partno);
+ delete_partition(disk, part);
}
disk_part_iter_exit(&piter);
invalidate_partition(disk, 0);
- bdev_unhash_inode(disk_devt(disk));
set_capacity(disk, 0);
disk->flags &= ~GENHD_FL_UP;
up_write(&disk->lookup_sem);
@@ -1806,20 +1823,6 @@ int bdev_read_only(struct block_device *bdev)
EXPORT_SYMBOL(bdev_read_only);
-int invalidate_partition(struct gendisk *disk, int partno)
-{
- int res = 0;
- struct block_device *bdev = bdget_disk(disk, partno);
- if (bdev) {
- fsync_bdev(bdev);
- res = __invalidate_device(bdev, true);
- bdput(bdev);
- }
- return res;
-}
-
-EXPORT_SYMBOL(invalidate_partition);
-
/*
* Disk events - monitor disk events like media change and eject request.
*/
diff --git a/block/ioctl.c b/block/ioctl.c
index 6e827de1a4c4..75c64811b534 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -16,142 +16,44 @@
static int blkpg_do_ioctl(struct block_device *bdev,
struct blkpg_partition __user *upart, int op)
{
- struct block_device *bdevp;
- struct gendisk *disk;
- struct hd_struct *part, *lpart;
struct blkpg_partition p;
- struct disk_part_iter piter;
long long start, length;
- int partno;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&p, upart, sizeof(struct blkpg_partition)))
return -EFAULT;
- disk = bdev->bd_disk;
if (bdev != bdev->bd_contains)
return -EINVAL;
- partno = p.pno;
- if (partno <= 0)
+
+ if (p.pno <= 0)
return -EINVAL;
+
+ if (op == BLKPG_DEL_PARTITION)
+ return bdev_del_partition(bdev, p.pno);
+
+ start = p.start >> SECTOR_SHIFT;
+ length = p.length >> SECTOR_SHIFT;
+
+ /* check for fit in a hd_struct */
+ if (sizeof(sector_t) < sizeof(long long)) {
+ long pstart = start, plength = length;
+
+ if (pstart != start || plength != length || pstart < 0 ||
+ plength < 0 || p.pno > 65535)
+ return -EINVAL;
+ }
+
switch (op) {
- case BLKPG_ADD_PARTITION:
- start = p.start >> 9;
- length = p.length >> 9;
- /* check for fit in a hd_struct */
- if (sizeof(sector_t) == sizeof(long) &&
- sizeof(long long) > sizeof(long)) {
- long pstart = start, plength = length;
- if (pstart != start || plength != length
- || pstart < 0 || plength < 0 || partno > 65535)
- return -EINVAL;
- }
- /* check if partition is aligned to blocksize */
- if (p.start & (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
-
- mutex_lock(&bdev->bd_mutex);
-
- /* overlap? */
- disk_part_iter_init(&piter, disk,
- DISK_PITER_INCL_EMPTY);
- while ((part = disk_part_iter_next(&piter))) {
- if (!(start + length <= part->start_sect ||
- start >= part->start_sect + part->nr_sects)) {
- disk_part_iter_exit(&piter);
- mutex_unlock(&bdev->bd_mutex);
- return -EBUSY;
- }
- }
- disk_part_iter_exit(&piter);
-
- /* all seems OK */
- part = add_partition(disk, partno, start, length,
- ADDPART_FLAG_NONE, NULL);
- mutex_unlock(&bdev->bd_mutex);
- return PTR_ERR_OR_ZERO(part);
- case BLKPG_DEL_PARTITION:
- part = disk_get_part(disk, partno);
- if (!part)
- return -ENXIO;
-
- bdevp = bdget(part_devt(part));
- disk_put_part(part);
- if (!bdevp)
- return -ENOMEM;
-
- mutex_lock(&bdevp->bd_mutex);
- if (bdevp->bd_openers) {
- mutex_unlock(&bdevp->bd_mutex);
- bdput(bdevp);
- return -EBUSY;
- }
- /* all seems OK */
- fsync_bdev(bdevp);
- invalidate_bdev(bdevp);
-
- mutex_lock_nested(&bdev->bd_mutex, 1);
- delete_partition(disk, partno);
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdevp->bd_mutex);
- bdput(bdevp);
-
- return 0;
- case BLKPG_RESIZE_PARTITION:
- start = p.start >> 9;
- /* new length of partition in bytes */
- length = p.length >> 9;
- /* check for fit in a hd_struct */
- if (sizeof(sector_t) == sizeof(long) &&
- sizeof(long long) > sizeof(long)) {
- long pstart = start, plength = length;
- if (pstart != start || plength != length
- || pstart < 0 || plength < 0)
- return -EINVAL;
- }
- part = disk_get_part(disk, partno);
- if (!part)
- return -ENXIO;
- bdevp = bdget(part_devt(part));
- if (!bdevp) {
- disk_put_part(part);
- return -ENOMEM;
- }
- mutex_lock(&bdevp->bd_mutex);
- mutex_lock_nested(&bdev->bd_mutex, 1);
- if (start != part->start_sect) {
- mutex_unlock(&bdevp->bd_mutex);
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdevp);
- disk_put_part(part);
- return -EINVAL;
- }
- /* overlap? */
- disk_part_iter_init(&piter, disk,
- DISK_PITER_INCL_EMPTY);
- while ((lpart = disk_part_iter_next(&piter))) {
- if (lpart->partno != partno &&
- !(start + length <= lpart->start_sect ||
- start >= lpart->start_sect + lpart->nr_sects)
- ) {
- disk_part_iter_exit(&piter);
- mutex_unlock(&bdevp->bd_mutex);
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdevp);
- disk_put_part(part);
- return -EBUSY;
- }
- }
- disk_part_iter_exit(&piter);
- part_nr_sects_write(part, (sector_t)length);
- i_size_write(bdevp->bd_inode, p.length);
- mutex_unlock(&bdevp->bd_mutex);
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdevp);
- disk_put_part(part);
- return 0;
- default:
+ case BLKPG_ADD_PARTITION:
+ /* check if partition is aligned to blocksize */
+ if (p.start & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
+ return bdev_add_partition(bdev, p.pno, start, length);
+ case BLKPG_RESIZE_PARTITION:
+ return bdev_resize_partition(bdev, p.pno, start, length);
+ default:
+ return -EINVAL;
}
}
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 9ef48a8cff86..873999e2e2f2 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -274,10 +274,10 @@ struct device_type part_type = {
.uevent = part_uevent,
};
-static void delete_partition_work_fn(struct work_struct *work)
+static void hd_struct_free_work(struct work_struct *work)
{
- struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct,
- rcu_work);
+ struct hd_struct *part =
+ container_of(to_rcu_work(work), struct hd_struct, rcu_work);
part->start_sect = 0;
part->nr_sects = 0;
@@ -285,31 +285,31 @@ static void delete_partition_work_fn(struct work_struct *work)
put_device(part_to_dev(part));
}
-void __delete_partition(struct percpu_ref *ref)
+static void hd_struct_free(struct percpu_ref *ref)
{
struct hd_struct *part = container_of(ref, struct hd_struct, ref);
- INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn);
+
+ INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work);
queue_rcu_work(system_wq, &part->rcu_work);
}
+int hd_ref_init(struct hd_struct *part)
+{
+ if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL))
+ return -ENOMEM;
+ return 0;
+}
+
/*
* Must be called either with bd_mutex held, before a disk can be opened or
* after all disk users are gone.
*/
-void delete_partition(struct gendisk *disk, int partno)
+void delete_partition(struct gendisk *disk, struct hd_struct *part)
{
struct disk_part_tbl *ptbl =
rcu_dereference_protected(disk->part_tbl, 1);
- struct hd_struct *part;
- if (partno >= ptbl->len)
- return;
-
- part = rcu_dereference_protected(ptbl->part[partno], 1);
- if (!part)
- return;
-
- rcu_assign_pointer(ptbl->part[partno], NULL);
+ rcu_assign_pointer(ptbl->part[part->partno], NULL);
rcu_assign_pointer(ptbl->last_lookup, NULL);
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
@@ -321,7 +321,7 @@ void delete_partition(struct gendisk *disk, int partno)
* "in-use" until we really free the gendisk.
*/
blk_invalidate_devt(part_devt(part));
- hd_struct_kill(part);
+ percpu_ref_kill(&part->ref);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -335,7 +335,7 @@ static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
* Must be called either with bd_mutex held, before a disk can be opened or
* after all disk users are gone.
*/
-struct hd_struct *add_partition(struct gendisk *disk, int partno,
+static struct hd_struct *add_partition(struct gendisk *disk, int partno,
sector_t start, sector_t len, int flags,
struct partition_meta_info *info)
{
@@ -472,6 +472,121 @@ out_put:
return ERR_PTR(err);
}
+static bool partition_overlaps(struct gendisk *disk, sector_t start,
+ sector_t length, int skip_partno)
+{
+ struct disk_part_iter piter;
+ struct hd_struct *part;
+ bool overlap = false;
+
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+ while ((part = disk_part_iter_next(&piter))) {
+ if (part->partno == skip_partno ||
+ start >= part->start_sect + part->nr_sects ||
+ start + length <= part->start_sect)
+ continue;
+ overlap = true;
+ break;
+ }
+
+ disk_part_iter_exit(&piter);
+ return overlap;
+}
+
+int bdev_add_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length)
+{
+ struct hd_struct *part;
+
+ mutex_lock(&bdev->bd_mutex);
+ if (partition_overlaps(bdev->bd_disk, start, length, -1)) {
+ mutex_unlock(&bdev->bd_mutex);
+ return -EBUSY;
+ }
+
+ part = add_partition(bdev->bd_disk, partno, start, length,
+ ADDPART_FLAG_NONE, NULL);
+ mutex_unlock(&bdev->bd_mutex);
+ return PTR_ERR_OR_ZERO(part);
+}
+
+int bdev_del_partition(struct block_device *bdev, int partno)
+{
+ struct block_device *bdevp;
+ struct hd_struct *part;
+ int ret = 0;
+
+ part = disk_get_part(bdev->bd_disk, partno);
+ if (!part)
+ return -ENXIO;
+
+ ret = -ENOMEM;
+ bdevp = bdget(part_devt(part));
+ if (!bdevp)
+ goto out_put_part;
+
+ mutex_lock(&bdevp->bd_mutex);
+
+ ret = -EBUSY;
+ if (bdevp->bd_openers)
+ goto out_unlock;
+
+ sync_blockdev(bdevp);
+ invalidate_bdev(bdevp);
+
+ mutex_lock_nested(&bdev->bd_mutex, 1);
+ delete_partition(bdev->bd_disk, part);
+ mutex_unlock(&bdev->bd_mutex);
+
+ ret = 0;
+out_unlock:
+ mutex_unlock(&bdevp->bd_mutex);
+ bdput(bdevp);
+out_put_part:
+ disk_put_part(part);
+ return ret;
+}
+
+int bdev_resize_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length)
+{
+ struct block_device *bdevp;
+ struct hd_struct *part;
+ int ret = 0;
+
+ part = disk_get_part(bdev->bd_disk, partno);
+ if (!part)
+ return -ENXIO;
+
+ ret = -ENOMEM;
+ bdevp = bdget(part_devt(part));
+ if (!bdevp)
+ goto out_put_part;
+
+ mutex_lock(&bdevp->bd_mutex);
+ mutex_lock_nested(&bdev->bd_mutex, 1);
+
+ ret = -EINVAL;
+ if (start != part->start_sect)
+ goto out_unlock;
+
+ ret = -EBUSY;
+ if (partition_overlaps(bdev->bd_disk, start, length, partno))
+ goto out_unlock;
+
+ part_nr_sects_write(part, (sector_t)length);
+ i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT);
+
+ ret = 0;
+out_unlock:
+ mutex_unlock(&bdevp->bd_mutex);
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdevp);
+out_put_part:
+ disk_put_part(part);
+ return ret;
+}
+
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
const struct block_device_operations *bdops = disk->fops;
@@ -488,27 +603,30 @@ static bool disk_unlock_native_capacity(struct gendisk *disk)
}
}
-int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev)
+int blk_drop_partitions(struct block_device *bdev)
{
struct disk_part_iter piter;
struct hd_struct *part;
- int res;
- if (!disk_part_scan_enabled(disk))
+ if (!disk_part_scan_enabled(bdev->bd_disk))
return 0;
if (bdev->bd_part_count)
return -EBUSY;
- res = invalidate_partition(disk, 0);
- if (res)
- return res;
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+ sync_blockdev(bdev);
+ invalidate_bdev(bdev);
+
+ disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter)))
- delete_partition(disk, part->partno);
+ delete_partition(bdev->bd_disk, part);
disk_part_iter_exit(&piter);
return 0;
}
+#ifdef CONFIG_S390
+/* for historic reasons in the DASD driver */
+EXPORT_SYMBOL_GPL(blk_drop_partitions);
+#endif
static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
struct parsed_partitions *state, int p)