Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig            |   1
-rw-r--r--  block/blk-cgroup.c       |   6
-rw-r--r--  block/blk-core.c         | 129
-rw-r--r--  block/blk-iocost.c       |  58
-rw-r--r--  block/blk-map.c          |   9
-rw-r--r--  block/blk-merge.c        |  50
-rw-r--r--  block/blk-mq-debugfs.c   |   1
-rw-r--r--  block/blk-mq-sched.c     |  82
-rw-r--r--  block/blk-mq.c           |  43
-rw-r--r--  block/blk-settings.c     |  37
-rw-r--r--  block/blk.h              |  45
-rw-r--r--  block/genhd.c            |  37
-rw-r--r--  block/ioctl.c            | 150
-rw-r--r--  block/partitions/core.c  | 168
14 files changed, 410 insertions, 406 deletions
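The block/ioctl.c changes below fold the BLKPG_ADD/DEL/RESIZE_PARTITION cases into the new bdev_add_partition(), bdev_del_partition() and bdev_resize_partition() helpers; the user-visible BLKPG ioctl itself is unchanged and still takes byte offsets that the kernel shifts down by SECTOR_SHIFT. As a minimal userspace sketch of that entry point (the device path, partition number and offsets are made-up example values; CAP_SYS_ADMIN and a whole-disk node are required):

/*
 * Minimal sketch of a BLKPG_ADD_PARTITION request against the reworked
 * blkpg_do_ioctl()/bdev_add_partition() path.  /dev/sdX and the byte
 * offsets are placeholder example values.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkpg.h>

int main(void)
{
	struct blkpg_partition part;
	struct blkpg_ioctl_arg arg;
	int fd;

	fd = open("/dev/sdX", O_RDWR);		/* hypothetical whole-disk node */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&part, 0, sizeof(part));
	part.pno = 1;				/* partition number (p.pno in the kernel) */
	part.start = 1ULL * 1024 * 1024;	/* byte offset, shifted by SECTOR_SHIFT in the kernel */
	part.length = 100ULL * 1024 * 1024;	/* byte length */

	memset(&arg, 0, sizeof(arg));
	arg.op = BLKPG_ADD_PARTITION;		/* or BLKPG_DEL_PARTITION / BLKPG_RESIZE_PARTITION */
	arg.datalen = sizeof(part);
	arg.data = &part;

	if (ioctl(fd, BLKPG, &arg) < 0)
		perror("BLKPG_ADD_PARTITION");

	close(fd);
	return 0;
}

A delete or resize request differs only in arg.op (and, for resize, the new length; the diff keeps the rule that the requested start must match the partition's existing start_sect).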
diff --git a/block/Kconfig b/block/Kconfig index 3bc76bb113a0..41cb34b0fcd1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -146,6 +146,7 @@ config BLK_CGROUP_IOLATENCY config BLK_CGROUP_IOCOST bool "Enable support for cost model based cgroup IO controller" depends on BLK_CGROUP=y + select BLK_RQ_IO_DATA_LEN select BLK_RQ_ALLOC_TIME ---help--- Enabling this option enables the .weight interface for cost diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 930212c1a512..0ecc897b225c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1530,6 +1530,10 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); + /* negative use_delay means no scaling, see blkcg_set_delay() */ + if (atomic_read(&blkg->use_delay) < 0) + return; + /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain @@ -1717,6 +1721,8 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } diff --git a/block/blk-core.c b/block/blk-core.c index 7e4a1da0715e..7f11560bfddb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -440,6 +440,23 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) } } +static inline int bio_queue_enter(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + bool nowait = bio->bi_opf & REQ_NOWAIT; + int ret; + + ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0); + if (unlikely(ret)) { + if (nowait && !blk_queue_dying(q)) + bio_wouldblock_error(bio); + else + bio_io_error(bio); + } + + return ret; +} + void blk_queue_exit(struct request_queue *q) { percpu_ref_put(&q->q_usage_counter); @@ -963,12 +980,13 @@ generic_make_request_checks(struct bio *bio) } /* - * Various block parts want %current->io_context and lazy ioc - * allocation ends up trading a lot of pain for a small amount of - * memory. Just allocate it upfront. This may fail and block - * layer knows how to live with it. + * Various block parts want %current->io_context, so allocate it up + * front rather than dealing with lots of pain to allocate it only + * where needed. This may fail and the block layer knows how to live + * with it. */ - create_io_context(GFP_ATOMIC, q->node); + if (unlikely(!current->io_context)) + create_task_io_context(current, GFP_ATOMIC, q->node); if (!blkcg_bio_issue_check(q, bio)) return false; @@ -991,28 +1009,13 @@ end_io: } /** - * generic_make_request - hand a buffer to its device driver for I/O + * generic_make_request - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * - * generic_make_request() is used to make I/O requests of block - * devices. It is passed a &struct bio, which describes the I/O that needs - * to be done. - * - * generic_make_request() does not return any status. The - * success/failure status of the request, along with notification of - * completion, is delivered asynchronously through the bio->bi_end_io - * function described (one day) else where. 
- * - * The caller of generic_make_request must make sure that bi_io_vec - * are set to describe the memory buffer, and that bi_dev and bi_sector are - * set to describe the device address, and the - * bi_end_io and optionally bi_private are set to describe how - * completion notification should be signaled. - * - * generic_make_request and the drivers it calls may use bi_next if this - * bio happens to be merged with someone else, and may resubmit the bio to - * a lower device by calling into generic_make_request recursively, which - * means the bio should NOT be touched after the call to ->make_request_fn. + * This is a version of submit_bio() that shall only be used for I/O that is + * resubmitted to lower level drivers by stacking block drivers. All file + * systems and other upper level users of the block layer should use + * submit_bio() instead. */ blk_qc_t generic_make_request(struct bio *bio) { @@ -1063,16 +1066,17 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; - blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? - BLK_MQ_REQ_NOWAIT : 0; - if (likely(blk_queue_enter(q, flags) == 0)) { + if (likely(bio_queue_enter(bio) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = q->make_request_fn(q, bio); + if (q->make_request_fn) + ret = q->make_request_fn(q, bio); + else + ret = blk_mq_make_request(q, bio); blk_queue_exit(q); @@ -1090,12 +1094,6 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); - } else { - if (unlikely(!blk_queue_dying(q) && - (bio->bi_opf & REQ_NOWAIT))) - bio_wouldblock_error(bio); - else - bio_io_error(bio); } bio = bio_list_pop(&bio_list_on_stack[0]); } while (bio); @@ -1112,28 +1110,22 @@ EXPORT_SYMBOL(generic_make_request); * * This function behaves like generic_make_request(), but does not protect * against recursion. Must only be used if the called driver is known - * to not call generic_make_request (or direct_make_request) again from - * its make_request function. (Calling direct_make_request again from - * a workqueue is perfectly fine as that doesn't recurse). + * to be blk-mq based. */ blk_qc_t direct_make_request(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; - bool nowait = bio->bi_opf & REQ_NOWAIT; blk_qc_t ret; + if (WARN_ON_ONCE(q->make_request_fn)) { + bio_io_error(bio); + return BLK_QC_T_NONE; + } if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; - - if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { - if (nowait && !blk_queue_dying(q)) - bio_wouldblock_error(bio); - else - bio_io_error(bio); + if (unlikely(bio_queue_enter(bio))) return BLK_QC_T_NONE; - } - - ret = q->make_request_fn(q, bio); + ret = blk_mq_make_request(q, bio); blk_queue_exit(q); return ret; } @@ -1143,17 +1135,17 @@ EXPORT_SYMBOL_GPL(direct_make_request); * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * - * submit_bio() is very similar in purpose to generic_make_request(), and - * uses that function to do most of the work. Both are fairly rough - * interfaces; @bio must be presetup and ready for I/O. + * submit_bio() is used to submit I/O requests to block devices. 
It is passed a + * fully set up &struct bio that describes the I/O that needs to be done. The + * bio will be send to the device described by the bi_disk and bi_partno fields. * + * The success/failure status of the request, along with notification of + * completion, is delivered asynchronously through the ->bi_end_io() callback + * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has + * been called. */ blk_qc_t submit_bio(struct bio *bio) { - bool workingset_read = false; - unsigned long pflags; - blk_qc_t ret; - if (blkcg_punt_bio_submit(bio)) return BLK_QC_T_NONE; @@ -1172,8 +1164,6 @@ blk_qc_t submit_bio(struct bio *bio) if (op_is_write(bio_op(bio))) { count_vm_events(PGPGOUT, count); } else { - if (bio_flagged(bio, BIO_WORKINGSET)) - workingset_read = true; task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } @@ -1189,20 +1179,24 @@ blk_qc_t submit_bio(struct bio *bio) } /* - * If we're reading data that is part of the userspace - * workingset, count submission time as memory stall. When the - * device is congested, or the submitting cgroup IO-throttled, - * submission can be a significant part of overall IO time. + * If we're reading data that is part of the userspace workingset, count + * submission time as memory stall. When the device is congested, or + * the submitting cgroup IO-throttled, submission can be a significant + * part of overall IO time. */ - if (workingset_read) - psi_memstall_enter(&pflags); - - ret = generic_make_request(bio); + if (unlikely(bio_op(bio) == REQ_OP_READ && + bio_flagged(bio, BIO_WORKINGSET))) { + unsigned long pflags; + blk_qc_t ret; - if (workingset_read) + psi_memstall_enter(&pflags); + ret = generic_make_request(bio); psi_memstall_leave(&pflags); - return ret; + return ret; + } + + return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); @@ -1638,7 +1632,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; - rq->extra_len = rq_src->extra_len; return 0; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7c1fe605d0d6..12eefb3d113f 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -260,6 +260,7 @@ enum { VTIME_PER_SEC_SHIFT = 37, VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, + VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC, /* bound vrate adjustments within two orders of magnitude */ VRATE_MIN_PPM = 10000, /* 1% */ @@ -1206,14 +1207,14 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 vtime = atomic64_read(&iocg->vtime); u64 vmargin = ioc->margin_us * now->vrate; u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; - u64 expires, oexpires; + u64 delta_ns, expires, oexpires; u32 hw_inuse; lockdep_assert_held(&iocg->waitq.lock); @@ -1236,15 +1237,10 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) return false; /* use delay */ - if (cost) { - u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC, - now->vrate); - blkcg_add_delay(blkg, now->now_ns, cost_ns); - } - blkcg_use_delay(blkg); - - expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; + delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, + 
now->vrate) * NSEC_PER_USEC; + blkcg_set_delay(blkg, delta_ns); + expires = now->now_ns + delta_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); @@ -1265,7 +1261,7 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) spin_lock_irqsave(&iocg->waitq.lock, flags); ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now, 0); + iocg_kick_delay(iocg, &now); spin_unlock_irqrestore(&iocg->waitq.lock, flags); return HRTIMER_NORESTART; @@ -1383,7 +1379,7 @@ static void ioc_timer_fn(struct timer_list *timer) if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); - iocg_kick_delay(iocg, &now, 0); + iocg_kick_delay(iocg, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; @@ -1678,6 +1674,31 @@ static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) return cost; } +static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc, + u64 *costp) +{ + unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT; + + switch (req_op(rq)) { + case REQ_OP_READ: + *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; + break; + case REQ_OP_WRITE: + *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; + break; + default: + *costp = 0; + } +} + +static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc) +{ + u64 cost; + + calc_size_vtime_cost_builtin(rq, ioc, &cost); + return cost; +} + static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; @@ -1762,7 +1783,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { iocg->abs_vdebt += abs_cost; - if (iocg_kick_delay(iocg, &now, cost)) + if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); spin_unlock_irq(&iocg->waitq.lock); @@ -1850,7 +1871,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, spin_lock_irqsave(&iocg->waitq.lock, flags); if (likely(!list_empty(&iocg->active_list))) { iocg->abs_vdebt += abs_cost; - iocg_kick_delay(iocg, &now, cost); + iocg_kick_delay(iocg, &now); } else { iocg_commit_bio(iocg, bio, cost); } @@ -1868,7 +1889,7 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); - u64 on_q_ns, rq_wait_ns; + u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) @@ -1889,8 +1910,10 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) on_q_ns = ktime_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; + size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); - if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC) + if (on_q_ns <= size_nsec || + on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); else this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); @@ -2297,6 +2320,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, spin_lock_irq(&ioc->lock); if (enable) { + blk_stat_enable_accounting(ioc->rqos.q); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); ioc->enabled = true; } else { diff --git a/block/blk-map.c b/block/blk-map.c 
index b72c361911a4..b6fa343fea9f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -654,8 +654,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, bio = rq->bio; } while (iov_iter_count(&i)); - if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->rq_flags |= RQF_COPY_USER; return 0; unmap_rq: @@ -731,7 +729,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; - int do_copy = 0; struct bio *bio, *orig_bio; int ret; @@ -740,8 +737,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); - if (do_copy) + if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf)) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else bio = bio_map_kern(q, kbuf, len, gfp_mask); @@ -752,9 +748,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= req_op(rq); - if (do_copy) - rq->rq_flags |= RQF_COPY_USER; - orig_bio = bio; ret = blk_rq_append_bio(rq, &bio); if (unlikely(ret)) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 1534ed736363..a04e991b5ded 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -336,16 +336,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, /* there isn't chance to merge the splitted bio */ split->bi_opf |= REQ_NOMERGE; - /* - * Since we're recursing into make_request here, ensure - * that we mark this bio as already having entered the queue. - * If not, and the queue is going away, we can get stuck - * forever on waiting for the queue reference to drop. But - * that will never happen, as we're already holding a - * reference to it. - */ - bio_set_flag(*bio, BIO_QUEUE_ENTERED); - bio_chain(split, *bio); trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); @@ -519,44 +509,20 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, * map a request to scatterlist, return number of sg entries setup. 
Caller * must make sure sg can hold rq->nr_phys_segments entries */ -int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +int __blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist, struct scatterlist **last_sg) { - struct scatterlist *sg = NULL; int nsegs = 0; if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, &sg); + nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) - nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, &sg); + nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg); else if (rq->bio) - nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); - - if (unlikely(rq->rq_flags & RQF_COPY_USER) && - (blk_rq_bytes(rq) & q->dma_pad_mask)) { - unsigned int pad_len = - (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; - - sg->length += pad_len; - rq->extra_len += pad_len; - } - - if (q->dma_drain_size && q->dma_drain_needed(rq)) { - if (op_is_write(req_op(rq))) - memset(q->dma_drain_buffer, 0, q->dma_drain_size); - - sg_unmark_end(sg); - sg = sg_next(sg); - sg_set_page(sg, virt_to_page(q->dma_drain_buffer), - q->dma_drain_size, - ((unsigned long)q->dma_drain_buffer) & - (PAGE_SIZE - 1)); - nsegs++; - rq->extra_len += q->dma_drain_size; - } + nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); - if (sg) - sg_mark_end(sg); + if (*last_sg) + sg_mark_end(*last_sg); /* * Something must have been wrong if the figured number of @@ -566,7 +532,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, return nsegs; } -EXPORT_SYMBOL(blk_rq_map_sg); +EXPORT_SYMBOL(__blk_rq_map_sg); static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b3f2ba483992..96b7a35c898a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -292,7 +292,6 @@ static const char *const rqf_name[] = { RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), RQF_NAME(PREEMPT), - RQF_NAME(COPY_USER), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(ELVPRIV), diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 74cedea56034..fdcc2c1dd178 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -80,16 +80,22 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) blk_mq_run_hw_queue(hctx, true); } +#define BLK_MQ_BUDGET_DELAY 3 /* ms units */ + /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * + * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to + * be run again. This is necessary to avoid starving flushes. */ -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); + int ret = 0; do { struct request *rq; @@ -97,12 +103,25 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; + if (!list_empty_careful(&hctx->dispatch)) { + ret = -EAGAIN; + break; + } + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); + /* + * We're releasing without dispatching. 
Holding the + * budget could have blocked any "hctx"s with the + * same queue and if we didn't dispatch then there's + * no guarantee anyone will kick the queue. Kick it + * ourselves. + */ + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); break; } @@ -113,6 +132,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) */ list_add(&rq->queuelist, &rq_list); } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + return ret; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, @@ -130,16 +151,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * + * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to + * to be run again. This is necessary to avoid starving flushes. */ -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + int ret = 0; do { struct request *rq; + if (!list_empty_careful(&hctx->dispatch)) { + ret = -EAGAIN; + break; + } + if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; @@ -149,6 +179,14 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); + /* + * We're releasing without dispatching. Holding the + * budget could have blocked any "hctx"s with the + * same queue and if we didn't dispatch then there's + * no guarantee anyone will kick the queue. Kick it + * ourselves. + */ + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); break; } @@ -165,21 +203,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); WRITE_ONCE(hctx->dispatch_from, ctx); + return ret; } -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.dispatch_request; + int ret = 0; LIST_HEAD(rq_list); - /* RCU or SRCU read lock is needed before checking quiesced flag */ - if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return; - - hctx->run++; - /* * If we have previous entries on our dispatch list, grab them first for * more fair dispatch. 
@@ -208,19 +242,41 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) - blk_mq_do_dispatch_sched(hctx); + ret = blk_mq_do_dispatch_sched(hctx); else - blk_mq_do_dispatch_ctx(hctx); + ret = blk_mq_do_dispatch_ctx(hctx); } } else if (has_sched_dispatch) { - blk_mq_do_dispatch_sched(hctx); + ret = blk_mq_do_dispatch_sched(hctx); } else if (hctx->dispatch_busy) { /* dequeue request one by one from sw queue if queue is busy */ - blk_mq_do_dispatch_ctx(hctx); + ret = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); } + + return ret; +} + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + /* RCU or SRCU read lock is needed before checking quiesced flag */ + if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) + return; + + hctx->run++; + + /* + * A return of -EAGAIN is an indication that hctx->dispatch is not + * empty and we must run again in order to avoid starving flushes. + */ + if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) { + if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) + blk_mq_run_hw_queue(hctx, true); + } } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq.c b/block/blk-mq.c index a7785df2c944..bcc3a2397d4a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -318,7 +318,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->nr_integrity_segments = 0; #endif /* tag was already set */ - rq->extra_len = 0; WRITE_ONCE(rq->deadline, 0); rq->timeout = 0; @@ -667,15 +666,6 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); - if (q->dma_drain_size && blk_rq_bytes(rq)) { - /* - * Make sure space for the drain appears. We know we can do - * this because max_hw_segments has been adjusted to be one - * fewer than the device can handle. - */ - rq->nr_phys_segments++; - } - #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); @@ -695,8 +685,6 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; } } @@ -1206,6 +1194,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool no_tag = false; int errors, queued; blk_status_t ret = BLK_STS_OK; + bool no_budget_avail = false; if (list_empty(list)) return false; @@ -1224,6 +1213,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, hctx = rq->mq_hctx; if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { blk_mq_put_driver_tag(rq); + no_budget_avail = true; break; } @@ -1320,13 +1310,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls - * that could otherwise occur if the queue is idle. + * that could otherwise occur if the queue is idle. We'll do + * similar if we couldn't get budget and SCHED_RESTART is set. 
*/ needs_restart = blk_mq_sched_needs_restart(hctx); if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && (ret == BLK_STS_RESOURCE)) + else if (needs_restart && (ret == BLK_STS_RESOURCE || + no_budget_avail)) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); @@ -1542,6 +1534,25 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) EXPORT_SYMBOL(blk_mq_run_hw_queues); /** + * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. + * @q: Pointer to the request queue to run. + * @msecs: Microseconds of delay to wait before running the queues. + */ +void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_hctx_stopped(hctx)) + continue; + + blk_mq_delay_run_hw_queue(hctx, msecs); + } +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); + +/** * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped * @q: request queue. * @@ -1973,7 +1984,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) * * Returns: Request queue cookie. */ -static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); @@ -2085,6 +2096,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return cookie; } +EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) @@ -2944,7 +2956,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index 14397b4c4b53..2ab1967b9716 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -652,43 +652,6 @@ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) EXPORT_SYMBOL(blk_queue_update_dma_pad); /** - * blk_queue_dma_drain - Set up a drain buffer for excess dma. - * @q: the request queue for the device - * @dma_drain_needed: fn which returns non-zero if drain is necessary - * @buf: physically contiguous buffer - * @size: size of the buffer in bytes - * - * Some devices have excess DMA problems and can't simply discard (or - * zero fill) the unwanted piece of the transfer. They have to have a - * real area of memory to transfer it into. The use case for this is - * ATAPI devices in DMA mode. If the packet command causes a transfer - * bigger than the transfer size some HBAs will lock up if there - * aren't DMA elements to contain the excess transfer. What this API - * does is adjust the queue so that the buf is always appended - * silently to the scatterlist. - * - * Note: This routine adjusts max_hw_segments to make room for appending - * the drain buffer. If you call blk_queue_max_segments() after calling - * this routine, you must set the limit to one fewer than your device - * can support otherwise there won't be room for the drain buffer. 
- */ -int blk_queue_dma_drain(struct request_queue *q, - dma_drain_needed_fn *dma_drain_needed, - void *buf, unsigned int size) -{ - if (queue_max_segments(q) < 2) - return -EINVAL; - /* make room for appending the drain */ - blk_queue_max_segments(q, queue_max_segments(q) - 1); - q->dma_drain_needed = dma_drain_needed; - q->dma_drain_buffer = buf; - q->dma_drain_size = size; - - return 0; -} -EXPORT_SYMBOL_GPL(blk_queue_dma_drain); - -/** * blk_queue_segment_boundary - set boundary rules for segment merging * @q: the request queue for the device * @mask: the memory boundary mask diff --git a/block/blk.h b/block/blk.h index 0a94ec68af32..73bd3b1c6938 100644 --- a/block/blk.h +++ b/block/blk.h @@ -303,26 +303,6 @@ void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); -/** - * create_io_context - try to create task->io_context - * @gfp_mask: allocation mask - * @node: allocation node - * - * If %current->io_context is %NULL, allocate a new io_context and install - * it. Returns the current %current->io_context which may be %NULL if - * allocation failed. - * - * Note that this function can't be called with IRQ disabled because - * task_lock which protects %current->io_context is IRQ-unsafe. - */ -static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) -{ - WARN_ON_ONCE(irqs_disabled()); - if (unlikely(!current->io_context)) - create_task_io_context(current, gfp_mask, node); - return current->io_context; -} - /* * Internal throttling interface */ @@ -389,20 +369,14 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -struct hd_struct *__must_check add_partition(struct gendisk *disk, int partno, - sector_t start, sector_t len, int flags, - struct partition_meta_info *info); -void __delete_partition(struct percpu_ref *ref); -void delete_partition(struct gendisk *disk, int partno); +void delete_partition(struct gendisk *disk, struct hd_struct *part); +int bdev_add_partition(struct block_device *bdev, int partno, + sector_t start, sector_t length); +int bdev_del_partition(struct block_device *bdev, int partno); +int bdev_resize_partition(struct block_device *bdev, int partno, + sector_t start, sector_t length); int disk_expand_part_tbl(struct gendisk *disk, int target); - -static inline int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, __delete_partition, 0, - GFP_KERNEL)) - return -ENOMEM; - return 0; -} +int hd_ref_init(struct hd_struct *part); static inline void hd_struct_get(struct hd_struct *part) { @@ -419,11 +393,6 @@ static inline void hd_struct_put(struct hd_struct *part) percpu_ref_put(&part->ref); } -static inline void hd_struct_kill(struct hd_struct *part) -{ - percpu_ref_kill(&part->ref); -} - static inline void hd_free_part(struct hd_struct *part) { free_part_stats(part); diff --git a/block/genhd.c b/block/genhd.c index 06b642b23a07..c05d509877fa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -878,6 +878,25 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) } EXPORT_SYMBOL(device_add_disk_no_queue_reg); +static void invalidate_partition(struct gendisk *disk, int partno) +{ + struct block_device *bdev; + + bdev = bdget_disk(disk, partno); + if (!bdev) + return; + + fsync_bdev(bdev); + __invalidate_device(bdev, true); + + /* + * Unhash the bdev inode for this device so that it gets evicted as soon + * as last inode reference is 
dropped. + */ + remove_inode_hash(bdev->bd_inode); + bdput(bdev); +} + void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; @@ -896,13 +915,11 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(disk, part->partno); - bdev_unhash_inode(part_devt(part)); - delete_partition(disk, part->partno); + delete_partition(disk, part); } disk_part_iter_exit(&piter); invalidate_partition(disk, 0); - bdev_unhash_inode(disk_devt(disk)); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&disk->lookup_sem); @@ -1806,20 +1823,6 @@ int bdev_read_only(struct block_device *bdev) EXPORT_SYMBOL(bdev_read_only); -int invalidate_partition(struct gendisk *disk, int partno) -{ - int res = 0; - struct block_device *bdev = bdget_disk(disk, partno); - if (bdev) { - fsync_bdev(bdev); - res = __invalidate_device(bdev, true); - bdput(bdev); - } - return res; -} - -EXPORT_SYMBOL(invalidate_partition); - /* * Disk events - monitor disk events like media change and eject request. */ diff --git a/block/ioctl.c b/block/ioctl.c index 6e827de1a4c4..75c64811b534 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -16,142 +16,44 @@ static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { - struct block_device *bdevp; - struct gendisk *disk; - struct hd_struct *part, *lpart; struct blkpg_partition p; - struct disk_part_iter piter; long long start, length; - int partno; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) return -EFAULT; - disk = bdev->bd_disk; if (bdev != bdev->bd_contains) return -EINVAL; - partno = p.pno; - if (partno <= 0) + + if (p.pno <= 0) return -EINVAL; + + if (op == BLKPG_DEL_PARTITION) + return bdev_del_partition(bdev, p.pno); + + start = p.start >> SECTOR_SHIFT; + length = p.length >> SECTOR_SHIFT; + + /* check for fit in a hd_struct */ + if (sizeof(sector_t) < sizeof(long long)) { + long pstart = start, plength = length; + + if (pstart != start || plength != length || pstart < 0 || + plength < 0 || p.pno > 65535) + return -EINVAL; + } + switch (op) { - case BLKPG_ADD_PARTITION: - start = p.start >> 9; - length = p.length >> 9; - /* check for fit in a hd_struct */ - if (sizeof(sector_t) == sizeof(long) && - sizeof(long long) > sizeof(long)) { - long pstart = start, plength = length; - if (pstart != start || plength != length - || pstart < 0 || plength < 0 || partno > 65535) - return -EINVAL; - } - /* check if partition is aligned to blocksize */ - if (p.start & (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - mutex_lock(&bdev->bd_mutex); - - /* overlap? 
*/ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) { - if (!(start + length <= part->start_sect || - start >= part->start_sect + part->nr_sects)) { - disk_part_iter_exit(&piter); - mutex_unlock(&bdev->bd_mutex); - return -EBUSY; - } - } - disk_part_iter_exit(&piter); - - /* all seems OK */ - part = add_partition(disk, partno, start, length, - ADDPART_FLAG_NONE, NULL); - mutex_unlock(&bdev->bd_mutex); - return PTR_ERR_OR_ZERO(part); - case BLKPG_DEL_PARTITION: - part = disk_get_part(disk, partno); - if (!part) - return -ENXIO; - - bdevp = bdget(part_devt(part)); - disk_put_part(part); - if (!bdevp) - return -ENOMEM; - - mutex_lock(&bdevp->bd_mutex); - if (bdevp->bd_openers) { - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); - return -EBUSY; - } - /* all seems OK */ - fsync_bdev(bdevp); - invalidate_bdev(bdevp); - - mutex_lock_nested(&bdev->bd_mutex, 1); - delete_partition(disk, partno); - mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); - - return 0; - case BLKPG_RESIZE_PARTITION: - start = p.start >> 9; - /* new length of partition in bytes */ - length = p.length >> 9; - /* check for fit in a hd_struct */ - if (sizeof(sector_t) == sizeof(long) && - sizeof(long long) > sizeof(long)) { - long pstart = start, plength = length; - if (pstart != start || plength != length - || pstart < 0 || plength < 0) - return -EINVAL; - } - part = disk_get_part(disk, partno); - if (!part) - return -ENXIO; - bdevp = bdget(part_devt(part)); - if (!bdevp) { - disk_put_part(part); - return -ENOMEM; - } - mutex_lock(&bdevp->bd_mutex); - mutex_lock_nested(&bdev->bd_mutex, 1); - if (start != part->start_sect) { - mutex_unlock(&bdevp->bd_mutex); - mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); - disk_put_part(part); - return -EINVAL; - } - /* overlap? 
*/ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY); - while ((lpart = disk_part_iter_next(&piter))) { - if (lpart->partno != partno && - !(start + length <= lpart->start_sect || - start >= lpart->start_sect + lpart->nr_sects) - ) { - disk_part_iter_exit(&piter); - mutex_unlock(&bdevp->bd_mutex); - mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); - disk_put_part(part); - return -EBUSY; - } - } - disk_part_iter_exit(&piter); - part_nr_sects_write(part, (sector_t)length); - i_size_write(bdevp->bd_inode, p.length); - mutex_unlock(&bdevp->bd_mutex); - mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); - disk_put_part(part); - return 0; - default: + case BLKPG_ADD_PARTITION: + /* check if partition is aligned to blocksize */ + if (p.start & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; + return bdev_add_partition(bdev, p.pno, start, length); + case BLKPG_RESIZE_PARTITION: + return bdev_resize_partition(bdev, p.pno, start, length); + default: + return -EINVAL; } } diff --git a/block/partitions/core.c b/block/partitions/core.c index 9ef48a8cff86..873999e2e2f2 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -274,10 +274,10 @@ struct device_type part_type = { .uevent = part_uevent, }; -static void delete_partition_work_fn(struct work_struct *work) +static void hd_struct_free_work(struct work_struct *work) { - struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct, - rcu_work); + struct hd_struct *part = + container_of(to_rcu_work(work), struct hd_struct, rcu_work); part->start_sect = 0; part->nr_sects = 0; @@ -285,31 +285,31 @@ static void delete_partition_work_fn(struct work_struct *work) put_device(part_to_dev(part)); } -void __delete_partition(struct percpu_ref *ref) +static void hd_struct_free(struct percpu_ref *ref) { struct hd_struct *part = container_of(ref, struct hd_struct, ref); - INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn); + + INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); queue_rcu_work(system_wq, &part->rcu_work); } +int hd_ref_init(struct hd_struct *part) +{ + if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL)) + return -ENOMEM; + return 0; +} + /* * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -void delete_partition(struct gendisk *disk, int partno) +void delete_partition(struct gendisk *disk, struct hd_struct *part) { struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - struct hd_struct *part; - if (partno >= ptbl->len) - return; - - part = rcu_dereference_protected(ptbl->part[partno], 1); - if (!part) - return; - - rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->part[part->partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); @@ -321,7 +321,7 @@ void delete_partition(struct gendisk *disk, int partno) * "in-use" until we really free the gendisk. */ blk_invalidate_devt(part_devt(part)); - hd_struct_kill(part); + percpu_ref_kill(&part->ref); } static ssize_t whole_disk_show(struct device *dev, @@ -335,7 +335,7 @@ static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ -struct hd_struct *add_partition(struct gendisk *disk, int partno, +static struct hd_struct *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { @@ -472,6 +472,121 @@ out_put: return ERR_PTR(err); } +static bool partition_overlaps(struct gendisk *disk, sector_t start, + sector_t length, int skip_partno) +{ + struct disk_part_iter piter; + struct hd_struct *part; + bool overlap = false; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) { + if (part->partno == skip_partno || + start >= part->start_sect + part->nr_sects || + start + length <= part->start_sect) + continue; + overlap = true; + break; + } + + disk_part_iter_exit(&piter); + return overlap; +} + +int bdev_add_partition(struct block_device *bdev, int partno, + sector_t start, sector_t length) +{ + struct hd_struct *part; + + mutex_lock(&bdev->bd_mutex); + if (partition_overlaps(bdev->bd_disk, start, length, -1)) { + mutex_unlock(&bdev->bd_mutex); + return -EBUSY; + } + + part = add_partition(bdev->bd_disk, partno, start, length, + ADDPART_FLAG_NONE, NULL); + mutex_unlock(&bdev->bd_mutex); + return PTR_ERR_OR_ZERO(part); +} + +int bdev_del_partition(struct block_device *bdev, int partno) +{ + struct block_device *bdevp; + struct hd_struct *part; + int ret = 0; + + part = disk_get_part(bdev->bd_disk, partno); + if (!part) + return -ENXIO; + + ret = -ENOMEM; + bdevp = bdget(part_devt(part)); + if (!bdevp) + goto out_put_part; + + mutex_lock(&bdevp->bd_mutex); + + ret = -EBUSY; + if (bdevp->bd_openers) + goto out_unlock; + + sync_blockdev(bdevp); + invalidate_bdev(bdevp); + + mutex_lock_nested(&bdev->bd_mutex, 1); + delete_partition(bdev->bd_disk, part); + mutex_unlock(&bdev->bd_mutex); + + ret = 0; +out_unlock: + mutex_unlock(&bdevp->bd_mutex); + bdput(bdevp); +out_put_part: + disk_put_part(part); + return ret; +} + +int bdev_resize_partition(struct block_device *bdev, int partno, + sector_t start, sector_t length) +{ + struct block_device *bdevp; + struct hd_struct *part; + int ret = 0; + + part = disk_get_part(bdev->bd_disk, partno); + if (!part) + return -ENXIO; + + ret = -ENOMEM; + bdevp = bdget(part_devt(part)); + if (!bdevp) + goto out_put_part; + + mutex_lock(&bdevp->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, 1); + + ret = -EINVAL; + if (start != part->start_sect) + goto out_unlock; + + ret = -EBUSY; + if (partition_overlaps(bdev->bd_disk, start, length, partno)) + goto out_unlock; + + part_nr_sects_write(part, (sector_t)length); + i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT); + + ret = 0; +out_unlock: + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); +out_put_part: + disk_put_part(part); + return ret; +} + static bool disk_unlock_native_capacity(struct gendisk *disk) { const struct block_device_operations *bdops = disk->fops; @@ -488,27 +603,30 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) +int blk_drop_partitions(struct block_device *bdev) { struct disk_part_iter piter; struct hd_struct *part; - int res; - if (!disk_part_scan_enabled(disk)) + if (!disk_part_scan_enabled(bdev->bd_disk)) return 0; if (bdev->bd_part_count) return -EBUSY; - res = invalidate_partition(disk, 0); - if (res) - return res; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + sync_blockdev(bdev); + invalidate_bdev(bdev); + + disk_part_iter_init(&piter, bdev->bd_disk, 
DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(disk, part->partno); + delete_partition(bdev->bd_disk, part); disk_part_iter_exit(&piter); return 0; } +#ifdef CONFIG_S390 +/* for historic reasons in the DASD driver */ +EXPORT_SYMBOL_GPL(blk_drop_partitions); +#endif static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, struct parsed_partitions *state, int p) |