Diffstat (limited to 'drivers/block')
30 files changed, 2697 insertions, 3998 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e48b24be45ee..df38fb364904 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -256,49 +256,6 @@ config BLK_DEV_RAM_SIZE
 	  The default value is 4096 kilobytes. Only change this if you know
 	  what you are doing.
 
-config CDROM_PKTCDVD
-	tristate "Packet writing on CD/DVD media (DEPRECATED)"
-	depends on !UML
-	depends on SCSI
-	select CDROM
-	help
-	  Note: This driver is deprecated and will be removed from the
-	  kernel in the near future!
-
-	  If you have a CDROM/DVD drive that supports packet writing, say
-	  Y to include support. It should work with any MMC/Mt Fuji
-	  compliant ATAPI or SCSI drive, which is just about any newer
-	  DVD/CD writer.
-
-	  Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs
-	  is possible.
-	  DVD-RW disks must be in restricted overwrite mode.
-
-	  See the file <file:Documentation/cdrom/packet-writing.rst>
-	  for further information on the use of this driver.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called pktcdvd.
-
-config CDROM_PKTCDVD_BUFFERS
-	int "Free buffers for data gathering"
-	depends on CDROM_PKTCDVD
-	default "8"
-	help
-	  This controls the maximum number of active concurrent packets. More
-	  concurrent packets can increase write performance, but also require
-	  more memory. Each concurrent packet will require approximately 64Kb
-	  of non-swappable kernel memory, memory which will be allocated when
-	  a disc is opened for writing.
-
-config CDROM_PKTCDVD_WCACHE
-	bool "Enable write caching"
-	depends on CDROM_PKTCDVD
-	help
-	  If enabled, write caching will be set for the CD-R/W device. For now
-	  this option is dangerous unless the CD-RW media is known good, as we
-	  don't do deferred write error handling yet.
-
 config ATA_OVER_ETH
 	tristate "ATA over Ethernet support"
 	depends on NET
@@ -407,4 +364,23 @@ config BLKDEV_UBLK_LEGACY_OPCODES
 
 source "drivers/block/rnbd/Kconfig"
 
+config BLK_DEV_ZONED_LOOP
+	tristate "Zoned loopback device support"
+	depends on BLK_DEV_ZONED
+	help
+	  Saying Y here will allow you to create a zoned block device using
+	  regular files for zones (one file per zone). This is useful to test
+	  file systems, device mapper and applications that support zoned block
+	  devices. To create a zoned loop device, no user utility is needed; a
+	  zoned loop device can be created (or re-started) using a command
+	  like:
+
+	  echo "add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11" > \
+		/dev/zloop-control
+
+	  See Documentation/admin-guide/blockdev/zoned_loop.rst for usage
+	  details.
+
+	  If unsure, say N.
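A minimal usage sketch for the new zoned loopback driver, assuming the module is built as zloop (matching the zloop.o object added to the Makefile below) and that the created disk appears as /dev/zloop<id>. Only the "add" command itself comes from the help text above; the module name, device node name, and the blkzone step are illustrative assumptions.

    # Load the driver if built as a module (name assumed from zloop.o).
    modprobe zloop

    # Create zloop device 0: 16 GiB capacity, 256 MiB zones, 11 conventional
    # zones (command taken verbatim from the Kconfig help above).
    echo "add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11" > /dev/zloop-control

    # Inspect the emulated zone layout with util-linux blkzone
    # (/dev/zloop0 is the assumed node for id=0).
    blkzone report /dev/zloop0

Per the help text, each zone is backed by a regular file, which makes the device convenient for exercising zoned file systems and device mapper targets without ZNS or SMR hardware.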
+ endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 1105a2d4fdcb..a695ce74ef22 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_N64CART) += n64cart.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o -obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o @@ -41,5 +40,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o +obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 749ae1246f4c..d35caa3c69e1 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -80,6 +80,7 @@ enum { DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */ DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */ DEVFL_FREED = (1<<8), /* device has been cleaned up */ + DEVFL_DEAD = (1<<9), /* device has timed out of aoe_deadsecs */ }; enum { diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 92b06d1de4cc..6298f8e271e3 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -745,7 +745,7 @@ rexmit_timer(struct timer_list *timer) int utgts; /* number of aoetgt descriptors (not slots) */ int since; - d = from_timer(d, timer, timer); + d = timer_container_of(d, timer, timer); spin_lock_irqsave(&d->lock, flags); @@ -754,7 +754,7 @@ rexmit_timer(struct timer_list *timer) utgts = count_targets(d, NULL); - if (d->flags & DEVFL_TKILL) { + if (d->flags & (DEVFL_TKILL | DEVFL_DEAD)) { spin_unlock_irqrestore(&d->lock, flags); return; } @@ -786,7 +786,8 @@ rexmit_timer(struct timer_list *timer) * to clean up. 
*/ list_splice(&flist, &d->factive[0]); - aoedev_downdev(d); + d->flags |= DEVFL_DEAD; + queue_work(aoe_wq, &d->work); goto out; } @@ -898,6 +899,9 @@ aoecmd_sleepwork(struct work_struct *work) { struct aoedev *d = container_of(work, struct aoedev, work); + if (d->flags & DEVFL_DEAD) + aoedev_downdev(d); + if (d->flags & DEVFL_GDALLOC) aoeblk_gdalloc(d); diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 141b2a0e03f2..3a240755045b 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -149,7 +149,7 @@ dummy_timer(struct timer_list *t) { struct aoedev *d; - d = from_timer(d, t, timer); + d = timer_container_of(d, t, timer); if (d->flags & DEVFL_TKILL) return; d->timer.expires = jiffies + HZ; @@ -198,9 +198,13 @@ aoedev_downdev(struct aoedev *d) { struct aoetgt *t, **tt, **te; struct list_head *head, *pos, *nx; + struct request *rq, *rqnext; int i; + unsigned long flags; - d->flags &= ~DEVFL_UP; + spin_lock_irqsave(&d->lock, flags); + d->flags &= ~(DEVFL_UP | DEVFL_DEAD); + spin_unlock_irqrestore(&d->lock, flags); /* clean out active and to-be-retransmitted buffers */ for (i = 0; i < NFACTIVE; i++) { @@ -223,6 +227,13 @@ aoedev_downdev(struct aoedev *d) /* clean out the in-process request (if any) */ aoe_failip(d); + /* clean out any queued block requests */ + list_for_each_entry_safe(rq, rqnext, &d->rq_list, queuelist) { + list_del_init(&rq->queuelist); + blk_mq_start_request(rq); + blk_mq_end_request(rq, BLK_STS_IOERR); + } + /* fast fail all pending I/O */ if (d->blkq) { /* UP is cleared, freeze+quiesce to insure all are errored */ diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 292f127cae0a..0c2eabe14af3 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -54,32 +54,35 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) /* * Insert a new page for a given sector, if one does not already exist. */ -static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) +static struct page *brd_insert_page(struct brd_device *brd, sector_t sector, + blk_opf_t opf) + __releases(rcu) + __acquires(rcu) { - pgoff_t idx = sector >> PAGE_SECTORS_SHIFT; - struct page *page; - int ret = 0; - - page = brd_lookup_page(brd, sector); - if (page) - return 0; + gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO; + struct page *page, *ret; + rcu_read_unlock(); page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM); - if (!page) - return -ENOMEM; + if (!page) { + rcu_read_lock(); + return ERR_PTR(-ENOMEM); + } xa_lock(&brd->brd_pages); - ret = __xa_insert(&brd->brd_pages, idx, page, gfp); - if (!ret) - brd->brd_nr_pages++; - xa_unlock(&brd->brd_pages); - - if (ret < 0) { + ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL, + page, gfp); + rcu_read_lock(); + if (ret) { + xa_unlock(&brd->brd_pages); __free_page(page); - if (ret == -EBUSY) - ret = 0; + if (xa_is_err(ret)) + return ERR_PTR(xa_err(ret)); + return ret; } - return ret; + brd->brd_nr_pages++; + xa_unlock(&brd->brd_pages); + return page; } /* @@ -100,143 +103,77 @@ static void brd_free_pages(struct brd_device *brd) } /* - * copy_to_brd_setup must be called before copy_to_brd. It may sleep. + * Process a single segment. The segment is capped to not cross page boundaries + * in both the bio and the brd backing memory. 
*/ -static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n, - gfp_t gfp) -{ - unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; - size_t copy; - int ret; - - copy = min_t(size_t, n, PAGE_SIZE - offset); - ret = brd_insert_page(brd, sector, gfp); - if (ret) - return ret; - if (copy < n) { - sector += copy >> SECTOR_SHIFT; - ret = brd_insert_page(brd, sector, gfp); - } - return ret; -} - -/* - * Copy n bytes from src to the brd starting at sector. Does not sleep. - */ -static void copy_to_brd(struct brd_device *brd, const void *src, - sector_t sector, size_t n) +static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio) { + struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); + sector_t sector = bio->bi_iter.bi_sector; + u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT; + blk_opf_t opf = bio->bi_opf; struct page *page; - void *dst; - unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; - size_t copy; + void *kaddr; - copy = min_t(size_t, n, PAGE_SIZE - offset); - page = brd_lookup_page(brd, sector); - BUG_ON(!page); - - dst = kmap_atomic(page); - memcpy(dst + offset, src, copy); - kunmap_atomic(dst); - - if (copy < n) { - src += copy; - sector += copy >> SECTOR_SHIFT; - copy = n - copy; - page = brd_lookup_page(brd, sector); - BUG_ON(!page); - - dst = kmap_atomic(page); - memcpy(dst, src, copy); - kunmap_atomic(dst); - } -} - -/* - * Copy n bytes to dst from the brd starting at sector. Does not sleep. - */ -static void copy_from_brd(void *dst, struct brd_device *brd, - sector_t sector, size_t n) -{ - struct page *page; - void *src; - unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; - size_t copy; + bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset); - copy = min_t(size_t, n, PAGE_SIZE - offset); + rcu_read_lock(); page = brd_lookup_page(brd, sector); - if (page) { - src = kmap_atomic(page); - memcpy(dst, src + offset, copy); - kunmap_atomic(src); - } else - memset(dst, 0, copy); - - if (copy < n) { - dst += copy; - sector += copy >> SECTOR_SHIFT; - copy = n - copy; - page = brd_lookup_page(brd, sector); - if (page) { - src = kmap_atomic(page); - memcpy(dst, src, copy); - kunmap_atomic(src); - } else - memset(dst, 0, copy); + if (!page && op_is_write(opf)) { + page = brd_insert_page(brd, sector, opf); + if (IS_ERR(page)) + goto out_error; } -} - -/* - * Process a single bvec of a bio. - */ -static int brd_do_bvec(struct brd_device *brd, struct page *page, - unsigned int len, unsigned int off, blk_opf_t opf, - sector_t sector) -{ - void *mem; - int err = 0; + kaddr = bvec_kmap_local(&bv); if (op_is_write(opf)) { - /* - * Must use NOIO because we don't want to recurse back into the - * block or filesystem layers from page reclaim. - */ - gfp_t gfp = opf & REQ_NOWAIT ? 
GFP_NOWAIT : GFP_NOIO; - - err = copy_to_brd_setup(brd, sector, len, gfp); - if (err) - goto out; - } - - mem = kmap_atomic(page); - if (!op_is_write(opf)) { - copy_from_brd(mem + off, brd, sector, len); - flush_dcache_page(page); + memcpy_to_page(page, offset, kaddr, bv.bv_len); } else { - flush_dcache_page(page); - copy_to_brd(brd, mem + off, sector, len); + if (page) + memcpy_from_page(kaddr, page, offset, bv.bv_len); + else + memset(kaddr, 0, bv.bv_len); } - kunmap_atomic(mem); + kunmap_local(kaddr); + rcu_read_unlock(); + + bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len); + return true; + +out_error: + rcu_read_unlock(); + if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT)) + bio_wouldblock_error(bio); + else + bio_io_error(bio); + return false; +} -out: - return err; +static void brd_free_one_page(struct rcu_head *head) +{ + struct page *page = container_of(head, struct page, rcu_head); + + __free_page(page); } static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) { - sector_t aligned_sector = (sector + PAGE_SECTORS) & ~PAGE_SECTORS; + sector_t aligned_sector = round_up(sector, PAGE_SECTORS); + sector_t aligned_end = round_down( + sector + (size >> SECTOR_SHIFT), PAGE_SECTORS); struct page *page; - size -= (aligned_sector - sector) * SECTOR_SIZE; + if (aligned_end <= aligned_sector) + return; + xa_lock(&brd->brd_pages); - while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) { + while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) { page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); if (page) { - __free_page(page); + call_rcu(&page->rcu_head, brd_free_one_page); brd->brd_nr_pages--; } aligned_sector += PAGE_SECTORS; - size -= PAGE_SIZE; } xa_unlock(&brd->brd_pages); } @@ -244,36 +181,18 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) static void brd_submit_bio(struct bio *bio) { struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; - sector_t sector = bio->bi_iter.bi_sector; - struct bio_vec bvec; - struct bvec_iter iter; if (unlikely(op_is_discard(bio->bi_opf))) { - brd_do_discard(brd, sector, bio->bi_iter.bi_size); + brd_do_discard(brd, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); bio_endio(bio); return; } - bio_for_each_segment(bvec, bio, iter) { - unsigned int len = bvec.bv_len; - int err; - - /* Don't support un-aligned buffer */ - WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) || - (len & (SECTOR_SIZE - 1))); - - err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, - bio->bi_opf, sector); - if (err) { - if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) { - bio_wouldblock_error(bio); - return; - } - bio_io_error(bio); + do { + if (!brd_rw_bvec(brd, bio)) return; - } - sector += len >> SECTOR_SHIFT; - } + } while (bio->bi_iter.bi_size); bio_endio(bio); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e21492981f7d..f6d6276974ee 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -380,6 +380,9 @@ enum { /* this is/was a write request */ __EE_WRITE, + /* hand back using mempool_free(e, drbd_buffer_page_pool) */ + __EE_RELEASE_TO_MEMPOOL, + /* this is/was a write same request */ __EE_WRITE_SAME, @@ -402,6 +405,7 @@ enum { #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) #define EE_SUBMITTED (1<<__EE_SUBMITTED) #define EE_WRITE (1<<__EE_WRITE) +#define EE_RELEASE_TO_MEMPOOL (1<<__EE_RELEASE_TO_MEMPOOL) #define EE_WRITE_SAME (1<<__EE_WRITE_SAME) #define EE_APPLICATION (1<<__EE_APPLICATION) 
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ) @@ -858,7 +862,6 @@ struct drbd_device { struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ struct list_head done_ee; /* need to send P_WRITE_ACK */ struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ - struct list_head net_ee; /* zero-copy network send in progress */ struct list_head resync_reads; atomic_t pp_in_use; /* allocated from page pool */ @@ -1329,24 +1332,6 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t drbd_request_mempool; extern mempool_t drbd_ee_mempool; -/* drbd's page pool, used to buffer data received from the peer, - * or data requested by the peer. - * - * This does not have an emergency reserve. - * - * When allocating from this pool, it first takes pages from the pool. - * Only if the pool is depleted will try to allocate from the system. - * - * The assumption is that pages taken from this pool will be processed, - * and given back, "quickly", and then can be recycled, so we can avoid - * frequent calls to alloc_page(), and still will be able to make progress even - * under memory pressure. - */ -extern struct page *drbd_pp_pool; -extern spinlock_t drbd_pp_lock; -extern int drbd_pp_vacant; -extern wait_queue_head_t drbd_pp_wait; - /* We also need a standard (emergency-reserve backed) page pool * for meta data IO (activity log, bitmap). * We can keep it global, as long as it is used as "N pages at a time". @@ -1354,6 +1339,7 @@ extern wait_queue_head_t drbd_pp_wait; */ #define DRBD_MIN_POOL_PAGES 128 extern mempool_t drbd_md_io_page_pool; +extern mempool_t drbd_buffer_page_pool; /* We also need to make sure we get a bio * when we need it for housekeeping purposes */ @@ -1488,10 +1474,7 @@ extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, sector_t, unsigned int, unsigned int, gfp_t) __must_hold(local); -extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, - int); -#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) -#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) +extern void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *req); extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); @@ -1610,16 +1593,6 @@ static inline struct page *page_chain_next(struct page *page) for (; page && ({ n = page_chain_next(page); 1; }); page = n) -static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) -{ - struct page *page = peer_req->pages; - page_chain_for_each(page) { - if (page_count(page) > 1) - return 1; - } - return 0; -} - static inline union drbd_state drbd_read_state(struct drbd_device *device) { struct drbd_resource *resource = device->resource; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ced2cc5f46f2..c73376886e7a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -114,20 +114,10 @@ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t drbd_request_mempool; mempool_t drbd_ee_mempool; mempool_t drbd_md_io_page_pool; +mempool_t drbd_buffer_page_pool; struct bio_set drbd_md_io_bio_set; struct bio_set drbd_io_bio_set; -/* I do not use a standard mempool, because: - 1) I want to hand out the pre-allocated objects first. 
- 2) I want to be able to interrupt sleeping allocation with a signal. - Note: This is a single linked list, the next pointer is the private - member of struct page. - */ -struct page *drbd_pp_pool; -DEFINE_SPINLOCK(drbd_pp_lock); -int drbd_pp_vacant; -wait_queue_head_t drbd_pp_wait; - DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); static const struct block_device_operations drbd_ops = { @@ -1611,6 +1601,7 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device, struct drbd_peer_request *peer_req) { + bool use_sendpage = !(peer_req->flags & EE_RELEASE_TO_MEMPOOL); struct page *page = peer_req->pages; unsigned len = peer_req->i.size; int err; @@ -1619,8 +1610,13 @@ static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device, page_chain_for_each(page) { unsigned l = min_t(unsigned, len, PAGE_SIZE); - err = _drbd_send_page(peer_device, page, 0, l, - page_chain_next(page) ? MSG_MORE : 0); + if (likely(use_sendpage)) + err = _drbd_send_page(peer_device, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0); + else + err = _drbd_no_send_page(peer_device, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0); + if (err) return err; len -= l; @@ -1962,7 +1958,6 @@ void drbd_init_set_defaults(struct drbd_device *device) INIT_LIST_HEAD(&device->sync_ee); INIT_LIST_HEAD(&device->done_ee); INIT_LIST_HEAD(&device->read_ee); - INIT_LIST_HEAD(&device->net_ee); INIT_LIST_HEAD(&device->resync_reads); INIT_LIST_HEAD(&device->resync_work.list); INIT_LIST_HEAD(&device->unplug_work.list); @@ -2043,7 +2038,6 @@ void drbd_device_cleanup(struct drbd_device *device) D_ASSERT(device, list_empty(&device->sync_ee)); D_ASSERT(device, list_empty(&device->done_ee)); D_ASSERT(device, list_empty(&device->read_ee)); - D_ASSERT(device, list_empty(&device->net_ee)); D_ASSERT(device, list_empty(&device->resync_reads)); D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q)); D_ASSERT(device, list_empty(&device->resync_work.list)); @@ -2055,19 +2049,11 @@ void drbd_device_cleanup(struct drbd_device *device) static void drbd_destroy_mempools(void) { - struct page *page; - - while (drbd_pp_pool) { - page = drbd_pp_pool; - drbd_pp_pool = (struct page *)page_private(page); - __free_page(page); - drbd_pp_vacant--; - } - /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */ bioset_exit(&drbd_io_bio_set); bioset_exit(&drbd_md_io_bio_set); + mempool_exit(&drbd_buffer_page_pool); mempool_exit(&drbd_md_io_page_pool); mempool_exit(&drbd_ee_mempool); mempool_exit(&drbd_request_mempool); @@ -2086,9 +2072,8 @@ static void drbd_destroy_mempools(void) static int drbd_create_mempools(void) { - struct page *page; const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count; - int i, ret; + int ret; /* caches */ drbd_request_cache = kmem_cache_create( @@ -2125,6 +2110,10 @@ static int drbd_create_mempools(void) if (ret) goto Enomem; + ret = mempool_init_page_pool(&drbd_buffer_page_pool, number, 0); + if (ret) + goto Enomem; + ret = mempool_init_slab_pool(&drbd_request_mempool, number, drbd_request_cache); if (ret) @@ -2134,15 +2123,6 @@ static int drbd_create_mempools(void) if (ret) goto Enomem; - for (i = 0; i < number; i++) { - page = alloc_page(GFP_HIGHUSER); - if (!page) - goto Enomem; - set_page_private(page, (unsigned long)drbd_pp_pool); - drbd_pp_pool = page; - } - drbd_pp_vacant = number; - return 0; Enomem: @@ -2169,10 +2149,6 @@ static void drbd_release_all_peer_reqs(struct drbd_device *device) 
rr = drbd_free_peer_reqs(device, &device->done_ee); if (rr) drbd_err(device, "%d EEs in done list found!\n", rr); - - rr = drbd_free_peer_reqs(device, &device->net_ee); - if (rr) - drbd_err(device, "%d EEs in net list found!\n", rr); } /* caution. no locking. */ @@ -2863,11 +2839,6 @@ static int __init drbd_init(void) return err; } - /* - * allocate all necessary structs - */ - init_waitqueue_head(&drbd_pp_wait); - drbd_proc = NULL; /* play safe for drbd_cleanup */ idr_init(&drbd_devices); @@ -3591,7 +3562,8 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) static void md_sync_timer_fn(struct timer_list *t) { - struct drbd_device *device = from_timer(device, t, md_sync_timer); + struct drbd_device *device = timer_container_of(device, t, + md_sync_timer); drbd_device_post_work(device, MD_SYNC); } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index e5a2e5f7887b..caaf2781136d 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <linux/scatterlist.h> #include <linux/part_stat.h> +#include <linux/mempool.h> #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" @@ -63,182 +64,31 @@ static int e_end_block(struct drbd_work *, int); #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) -/* - * some helper functions to deal with single linked page lists, - * page->private being our "next" pointer. - */ - -/* If at least n pages are linked at head, get n pages off. - * Otherwise, don't modify head, and return NULL. - * Locking is the responsibility of the caller. - */ -static struct page *page_chain_del(struct page **head, int n) -{ - struct page *page; - struct page *tmp; - - BUG_ON(!n); - BUG_ON(!head); - - page = *head; - - if (!page) - return NULL; - - while (page) { - tmp = page_chain_next(page); - if (--n == 0) - break; /* found sufficient pages */ - if (tmp == NULL) - /* insufficient pages, don't use any of them. */ - return NULL; - page = tmp; - } - - /* add end of list marker for the returned list */ - set_page_private(page, 0); - /* actual return value, and adjustment of head */ - page = *head; - *head = tmp; - return page; -} - -/* may be used outside of locks to find the tail of a (usually short) - * "private" page chain, before adding it back to a global chain head - * with page_chain_add() under a spinlock. */ -static struct page *page_chain_tail(struct page *page, int *len) -{ - struct page *tmp; - int i = 1; - while ((tmp = page_chain_next(page))) { - ++i; - page = tmp; - } - if (len) - *len = i; - return page; -} - -static int page_chain_free(struct page *page) -{ - struct page *tmp; - int i = 0; - page_chain_for_each_safe(page, tmp) { - put_page(page); - ++i; - } - return i; -} - -static void page_chain_add(struct page **head, - struct page *chain_first, struct page *chain_last) -{ -#if 1 - struct page *tmp; - tmp = page_chain_tail(chain_first, NULL); - BUG_ON(tmp != chain_last); -#endif - - /* add chain to head */ - set_page_private(chain_last, (unsigned long)*head); - *head = chain_first; -} - -static struct page *__drbd_alloc_pages(struct drbd_device *device, - unsigned int number) +static struct page *__drbd_alloc_pages(unsigned int number) { struct page *page = NULL; struct page *tmp = NULL; unsigned int i = 0; - /* Yes, testing drbd_pp_vacant outside the lock is racy. - * So what. It saves a spin_lock. 
*/ - if (drbd_pp_vacant >= number) { - spin_lock(&drbd_pp_lock); - page = page_chain_del(&drbd_pp_pool, number); - if (page) - drbd_pp_vacant -= number; - spin_unlock(&drbd_pp_lock); - if (page) - return page; - } - /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ for (i = 0; i < number; i++) { - tmp = alloc_page(GFP_TRY); + tmp = mempool_alloc(&drbd_buffer_page_pool, GFP_TRY); if (!tmp) - break; + goto fail; set_page_private(tmp, (unsigned long)page); page = tmp; } - - if (i == number) - return page; - - /* Not enough pages immediately available this time. - * No need to jump around here, drbd_alloc_pages will retry this - * function "soon". */ - if (page) { - tmp = page_chain_tail(page, NULL); - spin_lock(&drbd_pp_lock); - page_chain_add(&drbd_pp_pool, page, tmp); - drbd_pp_vacant += i; - spin_unlock(&drbd_pp_lock); + return page; +fail: + page_chain_for_each_safe(page, tmp) { + set_page_private(page, 0); + mempool_free(page, &drbd_buffer_page_pool); } return NULL; } -static void reclaim_finished_net_peer_reqs(struct drbd_device *device, - struct list_head *to_be_freed) -{ - struct drbd_peer_request *peer_req, *tmp; - - /* The EEs are always appended to the end of the list. Since - they are sent in order over the wire, they have to finish - in order. As soon as we see the first not finished we can - stop to examine the list... */ - - list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) { - if (drbd_peer_req_has_active_page(peer_req)) - break; - list_move(&peer_req->w.list, to_be_freed); - } -} - -static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) -{ - LIST_HEAD(reclaimed); - struct drbd_peer_request *peer_req, *t; - - spin_lock_irq(&device->resource->req_lock); - reclaim_finished_net_peer_reqs(device, &reclaimed); - spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) - drbd_free_net_peer_req(device, peer_req); -} - -static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) -{ - struct drbd_peer_device *peer_device; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - if (!atomic_read(&device->pp_in_use_by_net)) - continue; - - kref_get(&device->kref); - rcu_read_unlock(); - drbd_reclaim_net_peer_reqs(device); - kref_put(&device->kref, drbd_destroy_device); - rcu_read_lock(); - } - rcu_read_unlock(); -} - /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * @peer_device: DRBD device. @@ -263,9 +113,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int bool retry) { struct drbd_device *device = peer_device->device; - struct page *page = NULL; + struct page *page; struct net_conf *nc; - DEFINE_WAIT(wait); unsigned int mxb; rcu_read_lock(); @@ -273,37 +122,9 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int mxb = nc ? nc->max_buffers : 1000000; rcu_read_unlock(); - if (atomic_read(&device->pp_in_use) < mxb) - page = __drbd_alloc_pages(device, number); - - /* Try to keep the fast path fast, but occasionally we need - * to reclaim the pages we lended to the network stack. 
*/ - if (page && atomic_read(&device->pp_in_use_by_net) > 512) - drbd_reclaim_net_peer_reqs(device); - - while (page == NULL) { - prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - - drbd_reclaim_net_peer_reqs(device); - - if (atomic_read(&device->pp_in_use) < mxb) { - page = __drbd_alloc_pages(device, number); - if (page) - break; - } - - if (!retry) - break; - - if (signal_pending(current)) { - drbd_warn(device, "drbd_alloc_pages interrupted!\n"); - break; - } - - if (schedule_timeout(HZ/10) == 0) - mxb = UINT_MAX; - } - finish_wait(&drbd_pp_wait, &wait); + if (atomic_read(&device->pp_in_use) >= mxb) + schedule_timeout_interruptible(HZ / 10); + page = __drbd_alloc_pages(number); if (page) atomic_add(number, &device->pp_in_use); @@ -314,29 +135,25 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int * Is also used from inside an other spin_lock_irq(&resource->req_lock); * Either links the page chain back to the global pool, * or returns all pages to the system. */ -static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net) +static void drbd_free_pages(struct drbd_device *device, struct page *page) { - atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use; - int i; + struct page *tmp; + int i = 0; if (page == NULL) return; - if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count) - i = page_chain_free(page); - else { - struct page *tmp; - tmp = page_chain_tail(page, &i); - spin_lock(&drbd_pp_lock); - page_chain_add(&drbd_pp_pool, page, tmp); - drbd_pp_vacant += i; - spin_unlock(&drbd_pp_lock); - } - i = atomic_sub_return(i, a); + page_chain_for_each_safe(page, tmp) { + set_page_private(page, 0); + if (page_count(page) == 1) + mempool_free(page, &drbd_buffer_page_pool); + else + put_page(page); + i++; + } + i = atomic_sub_return(i, &device->pp_in_use); if (i < 0) - drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n", - is_net ? 
"pp_in_use_by_net" : "pp_in_use", i); - wake_up(&drbd_pp_wait); + drbd_warn(device, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); } /* @@ -380,6 +197,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto gfpflags_allow_blocking(gfp_mask)); if (!page) goto fail; + if (!mempool_is_saturated(&drbd_buffer_page_pool)) + peer_req->flags |= EE_RELEASE_TO_MEMPOOL; } memset(peer_req, 0, sizeof(*peer_req)); @@ -403,13 +222,12 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return NULL; } -void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, - int is_net) +void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req) { might_sleep(); if (peer_req->flags & EE_HAS_DIGEST) kfree(peer_req->digest); - drbd_free_pages(device, peer_req->pages, is_net); + drbd_free_pages(device, peer_req->pages); D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); D_ASSERT(device, drbd_interval_empty(&peer_req->i)); if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { @@ -424,14 +242,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) LIST_HEAD(work_list); struct drbd_peer_request *peer_req, *t; int count = 0; - int is_net = list == &device->net_ee; spin_lock_irq(&device->resource->req_lock); list_splice_init(list, &work_list); spin_unlock_irq(&device->resource->req_lock); list_for_each_entry_safe(peer_req, t, &work_list, w.list) { - __drbd_free_peer_req(device, peer_req, is_net); + drbd_free_peer_req(device, peer_req); count++; } return count; @@ -443,18 +260,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) static int drbd_finish_peer_reqs(struct drbd_device *device) { LIST_HEAD(work_list); - LIST_HEAD(reclaimed); struct drbd_peer_request *peer_req, *t; int err = 0; spin_lock_irq(&device->resource->req_lock); - reclaim_finished_net_peer_reqs(device, &reclaimed); list_splice_init(&device->done_ee, &work_list); spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) - drbd_free_net_peer_req(device, peer_req); - /* possible callbacks here: * e_end_block, and e_end_resync_block, e_send_superseded. * all ignore the last argument. @@ -1975,7 +1787,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) data_size -= len; } kunmap(page); - drbd_free_pages(peer_device->device, page, 0); + drbd_free_pages(peer_device->device, page); return err; } @@ -2500,7 +2312,11 @@ static int handle_write_conflicts(struct drbd_device *device, peer_req->w.cb = superseded ? e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); - queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); + /* put is in drbd_send_acks_wf() */ + kref_get(&device->kref); + if (!queue_work(connection->ack_sender, + &peer_req->peer_device->send_acks_work)) + kref_put(&device->kref, drbd_destroy_device); err = -ENOENT; goto out; @@ -5220,16 +5036,6 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) put_ldev(device); } - /* tcp_close and release of sendpage pages can be deferred. I don't - * want to use SO_LINGER, because apparently it can be deferred for - * more than 20 seconds (longest time I checked). - * - * Actually we don't care for exactly when the network stack does its - * put_page(), but release our reference on these pages right here. 
- */ - i = drbd_free_peer_reqs(device, &device->net_ee); - if (i) - drbd_info(device, "net_ee not empty, killed %u entries\n", i); i = atomic_read(&device->pp_in_use_by_net); if (i) drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i); @@ -5976,8 +5782,6 @@ int drbd_ack_receiver(struct drbd_thread *thi) while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - conn_reclaim_net_peer_reqs(connection); - if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (drbd_send_ping(connection)) { drbd_err(connection, "drbd_send_ping has failed\n"); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 380e6584a4ee..d15826f6ee81 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1699,7 +1699,8 @@ static bool net_timeout_reached(struct drbd_request *net_req, void request_timer_fn(struct timer_list *t) { - struct drbd_device *device = from_timer(device, t, request_timer); + struct drbd_device *device = timer_container_of(device, t, + request_timer); struct drbd_connection *connection = first_peer_device(device)->connection; struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */ struct net_conf *nc; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4352a50fbb3f..dea3e79d044f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -442,7 +442,8 @@ int w_resync_timer(struct drbd_work *w, int cancel) void resync_timer_fn(struct timer_list *t) { - struct drbd_device *device = from_timer(device, t, resync_timer); + struct drbd_device *device = timer_container_of(device, t, + resync_timer); drbd_queue_work_if_unqueued( &first_peer_device(device)->connection->sender_work, @@ -1029,22 +1030,6 @@ out: return 1; } -/* helper */ -static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req) -{ - if (drbd_peer_req_has_active_page(peer_req)) { - /* This might happen if sendpage() has not finished */ - int i = PFN_UP(peer_req->i.size); - atomic_add(i, &device->pp_in_use_by_net); - atomic_sub(i, &device->pp_in_use); - spin_lock_irq(&device->resource->req_lock); - list_add_tail(&peer_req->w.list, &device->net_ee); - spin_unlock_irq(&device->resource->req_lock); - wake_up(&drbd_pp_wait); - } else - drbd_free_peer_req(device, peer_req); -} - /** * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST * @w: work object. 
@@ -1058,9 +1043,8 @@ int w_e_end_data_req(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - drbd_free_peer_req(device, peer_req); - dec_unacked(device); - return 0; + err = 0; + goto out; } if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { @@ -1073,12 +1057,12 @@ int w_e_end_data_req(struct drbd_work *w, int cancel) err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req); } - dec_unacked(device); - - move_to_net_ee_or_free(device, peer_req); - if (unlikely(err)) drbd_err(device, "drbd_send_block() failed\n"); +out: + dec_unacked(device); + drbd_free_peer_req(device, peer_req); + return err; } @@ -1119,9 +1103,8 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - drbd_free_peer_req(device, peer_req); - dec_unacked(device); - return 0; + err = 0; + goto out; } if (get_ldev_if_state(device, D_FAILED)) { @@ -1154,13 +1137,12 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) /* update resync data with failure */ drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size); } - - dec_unacked(device); - - move_to_net_ee_or_free(device, peer_req); - if (unlikely(err)) drbd_err(device, "drbd_send_block() failed\n"); +out: + dec_unacked(device); + drbd_free_peer_req(device, peer_req); + return err; } @@ -1175,9 +1157,8 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) int err, eq = 0; if (unlikely(cancel)) { - drbd_free_peer_req(device, peer_req); - dec_unacked(device); - return 0; + err = 0; + goto out; } if (get_ldev(device)) { @@ -1219,12 +1200,12 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) if (drbd_ratelimit()) drbd_err(device, "Sending NegDReply. I guess it gets messy.\n"); } - - dec_unacked(device); - move_to_net_ee_or_free(device, peer_req); - if (unlikely(err)) drbd_err(device, "drbd_send_block/ack() failed\n"); +out: + dec_unacked(device); + drbd_free_peer_req(device, peer_req); + return err; } @@ -1698,7 +1679,8 @@ void drbd_rs_controller_reset(struct drbd_peer_device *peer_device) void start_resync_timer_fn(struct timer_list *t) { - struct drbd_device *device = from_timer(device, t, start_resync_timer); + struct drbd_device *device = timer_container_of(device, t, + start_resync_timer); drbd_device_post_work(device, RS_START); } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index e97432032f01..24be0c2c4075 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3411,7 +3411,7 @@ static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode, struct floppy_max_errors max_errors; struct floppy_drive_params dp; } inparam; /* parameters coming from user space */ - const void *outparam; /* parameters passed back to user space */ + const void *outparam = NULL; /* parameters passed back to user space */ /* convert compatibility eject ioctls into floppy eject ioctl. 
* We do this in order to provide a means to eject floppy disks before diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e2b1f377f585..1b6ee91f8eb9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -313,6 +313,8 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) return; kfree(cmd->bvec); cmd->bvec = NULL; + if (req_op(rq) == REQ_OP_WRITE) + kiocb_end_write(&cmd->iocb); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } @@ -387,9 +389,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_flags = 0; } - if (rw == ITER_SOURCE) + if (rw == ITER_SOURCE) { + kiocb_start_write(&cmd->iocb); ret = file->f_op->write_iter(&cmd->iocb, &iter); - else + } else ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); @@ -1244,12 +1247,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS; lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); - if (size_changed) { - loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, - lo->lo_backing_file); - loop_set_size(lo, new_size); - } - /* update the direct I/O flag if lo_offset changed */ loop_update_dio(lo); @@ -1257,6 +1254,11 @@ out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue, memflags); if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + if (!err && size_changed) { + loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, + lo->lo_backing_file); + loop_set_size(lo, new_size); + } out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) @@ -1429,17 +1431,34 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) return 0; } -static int loop_set_block_size(struct loop_device *lo, unsigned long arg) +static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, unsigned long arg) { struct queue_limits lim; unsigned int memflags; int err = 0; - if (lo->lo_state != Lo_bound) - return -ENXIO; + /* + * If we don't hold exclusive handle for the device, upgrade to it + * here to avoid changing device under exclusive owner. 
+ */ + if (!(mode & BLK_OPEN_EXCL)) { + err = bd_prepare_to_claim(bdev, loop_set_block_size, NULL); + if (err) + return err; + } + + err = mutex_lock_killable(&lo->lo_mutex); + if (err) + goto abort_claim; + + if (lo->lo_state != Lo_bound) { + err = -ENXIO; + goto unlock; + } if (lo->lo_queue->limits.logical_block_size == arg) - return 0; + goto unlock; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); @@ -1452,6 +1471,11 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue, memflags); +unlock: + mutex_unlock(&lo->lo_mutex); +abort_claim: + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(bdev, loop_set_block_size); return err; } @@ -1470,9 +1494,6 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, case LOOP_SET_DIRECT_IO: err = loop_set_dio(lo, arg); break; - case LOOP_SET_BLOCK_SIZE: - err = loop_set_block_size(lo, arg); - break; default: err = -EINVAL; } @@ -1527,9 +1548,12 @@ static int lo_ioctl(struct block_device *bdev, blk_mode_t mode, break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); + case LOOP_SET_BLOCK_SIZE: + if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + return loop_set_block_size(lo, mode, bdev, arg); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: - case LOOP_SET_BLOCK_SIZE: if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 0d619df03fa9..8fc7761397bd 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2040,11 +2040,12 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * @dir Direction (read or write) * * return value - * None + * 0 The IO completed successfully. + * -ENOMEM The DMA mapping failed. 
*/ -static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, - struct mtip_cmd *command, - struct blk_mq_hw_ctx *hctx) +static int mtip_hw_submit_io(struct driver_data *dd, struct request *rq, + struct mtip_cmd *command, + struct blk_mq_hw_ctx *hctx) { struct mtip_cmd_hdr *hdr = dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag; @@ -2056,12 +2057,14 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, unsigned int nents; /* Map the scatter list for DMA access */ - nents = blk_rq_map_sg(rq, command->sg); - nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); + command->scatter_ents = blk_rq_map_sg(rq, command->sg); + nents = dma_map_sg(&dd->pdev->dev, command->sg, + command->scatter_ents, dma_dir); + if (!nents) + return -ENOMEM; - prefetch(&port->flags); - command->scatter_ents = nents; + prefetch(&port->flags); /* * The number of retries for this command before it is @@ -2112,11 +2115,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) { set_bit(rq->tag, port->cmds_to_issue); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); - return; + return 0; } /* Issue the command to the hardware */ mtip_issue_ncq_command(port, rq->tag); + + return 0; } /* @@ -3315,7 +3320,9 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - mtip_hw_submit_io(dd, rq, cmd, hctx); + if (mtip_hw_submit_io(dd, rq, cmd, hctx)) + return BLK_STS_IOERR; + return BLK_STS_OK; } @@ -3717,7 +3724,7 @@ static int mtip_pci_probe(struct pci_dev *pdev, rv = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (rv) { dev_warn(&pdev->dev, "64-bit DMA enable failed\n"); - goto setmask_err; + goto iomap_err; } /* Copy the info we may need later into the private data structure. 
*/ @@ -3733,7 +3740,7 @@ static int mtip_pci_probe(struct pci_dev *pdev, if (!dd->isr_workq) { dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance); rv = -ENOMEM; - goto setmask_err; + goto iomap_err; } memset(cpu_list, 0, sizeof(cpu_list)); @@ -3830,8 +3837,6 @@ msi_initialize_err: drop_cpu(dd->work[1].cpu_binding); drop_cpu(dd->work[2].cpu_binding); } -setmask_err: - pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); iomap_err: kfree(dd); @@ -3907,7 +3912,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) pci_disable_msi(pdev); - pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); pci_set_drvdata(pdev, NULL); put_disk(dd->disk); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7bdc7eb808ea..6463d0e8d0ce 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1473,7 +1473,17 @@ static int nbd_start_device(struct nbd_device *nbd) return -EINVAL; } - blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); +retry: + mutex_unlock(&nbd->config_lock); + blk_mq_update_nr_hw_queues(&nbd->tag_set, num_connections); + mutex_lock(&nbd->config_lock); + + /* if another code path updated nr_hw_queues, retry until succeed */ + if (num_connections != config->num_connections) { + num_connections = config->num_connections; + goto retry; + } + nbd->pid = task_pid_nr(current); nbd_parse_flags(nbd); @@ -2198,9 +2208,7 @@ again: goto out; } } - ret = nbd_start_device(nbd); - if (ret) - goto out; + if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) { nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER], GFP_KERNEL); @@ -2216,6 +2224,8 @@ again: goto out; } set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags); + + ret = nbd_start_device(nbd); out: mutex_unlock(&nbd->config_lock); if (!ret) { diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index aa163ae9b2aa..91642c9a3b29 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1179,7 +1179,7 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest, memcpy_page(dest, off + count, t_page->page, offset, temp); else - zero_user(dest, off + count, temp); + memzero_page(dest, off + count, temp); count += temp; sector += temp >> SECTOR_SHIFT; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c deleted file mode 100644 index 65b96c083b3c..000000000000 --- a/drivers/block/pktcdvd.c +++ /dev/null @@ -1,2916 +0,0 @@ -/* - * Copyright (C) 2000 Jens Axboe <axboe@suse.de> - * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com> - * Copyright (C) 2006 Thomas Maier <balagi@justmail.de> - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and - * DVD-RAM devices. - * - * Theory of operation: - * - * At the lowest level, there is the standard driver for the CD/DVD device, - * such as drivers/scsi/sr.c. This driver can handle read and write requests, - * but it doesn't know anything about the special restrictions that apply to - * packet writing. One restriction is that write requests must be aligned to - * packet boundaries on the physical media, and the size of a write request - * must be equal to the packet size. Another restriction is that a - * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read - * command, if the previous command was a write. 
- * - * The purpose of the packet writing driver is to hide these restrictions from - * higher layers, such as file systems, and present a block device that can be - * randomly read and written using 2kB-sized blocks. - * - * The lowest layer in the packet writing driver is the packet I/O scheduler. - * Its data is defined by the struct packet_iosched and includes two bio - * queues with pending read and write requests. These queues are processed - * by the pkt_iosched_process_queue() function. The write requests in this - * queue are already properly aligned and sized. This layer is responsible for - * issuing the flush cache commands and scheduling the I/O in a good order. - * - * The next layer transforms unaligned write requests to aligned writes. This - * transformation requires reading missing pieces of data from the underlying - * block device, assembling the pieces to full packets and queuing them to the - * packet I/O scheduler. - * - * At the top layer there is a custom ->submit_bio function that forwards - * read requests directly to the iosched queue and puts write requests in the - * unaligned write queue. A kernel thread performs the necessary read - * gathering to convert the unaligned writes to aligned writes and then feeds - * them to the packet I/O scheduler. - * - *************************************************************************/ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/backing-dev.h> -#include <linux/compat.h> -#include <linux/debugfs.h> -#include <linux/device.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/freezer.h> -#include <linux/kernel.h> -#include <linux/kthread.h> -#include <linux/miscdevice.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/nospec.h> -#include <linux/pktcdvd.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/types.h> -#include <linux/uaccess.h> - -#include <scsi/scsi.h> -#include <scsi/scsi_cmnd.h> -#include <scsi/scsi_ioctl.h> - -#include <linux/unaligned.h> - -#define DRIVER_NAME "pktcdvd" - -#define MAX_SPEED 0xffff - -static DEFINE_MUTEX(pktcdvd_mutex); -static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; -static struct proc_dir_entry *pkt_proc; -static int pktdev_major; -static int write_congestion_on = PKT_WRITE_CONGESTION_ON; -static int write_congestion_off = PKT_WRITE_CONGESTION_OFF; -static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ -static mempool_t psd_pool; -static struct bio_set pkt_bio_set; - -/* /sys/class/pktcdvd */ -static struct class class_pktcdvd; -static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ - -/* forward declaration */ -static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); -static int pkt_remove_dev(dev_t pkt_dev); - -static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) -{ - return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); -} - -/********************************************************** - * sysfs interface for pktcdvd - * by (C) 2006 Thomas Maier <balagi@justmail.de> - - /sys/class/pktcdvd/pktcdvd[0-7]/ - stat/reset - stat/packets_started - stat/packets_finished - stat/kb_written - stat/kb_read - stat/kb_read_gather - write_queue/size - write_queue/congestion_off - write_queue/congestion_on - **********************************************************/ - -static ssize_t packets_started_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct 
pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started); -} -static DEVICE_ATTR_RO(packets_started); - -static ssize_t packets_finished_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended); -} -static DEVICE_ATTR_RO(packets_finished); - -static ssize_t kb_written_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1); -} -static DEVICE_ATTR_RO(kb_written); - -static ssize_t kb_read_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1); -} -static DEVICE_ATTR_RO(kb_read); - -static ssize_t kb_read_gather_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1); -} -static DEVICE_ATTR_RO(kb_read_gather); - -static ssize_t reset_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - if (len > 0) { - pd->stats.pkt_started = 0; - pd->stats.pkt_ended = 0; - pd->stats.secs_w = 0; - pd->stats.secs_rg = 0; - pd->stats.secs_r = 0; - } - return len; -} -static DEVICE_ATTR_WO(reset); - -static struct attribute *pkt_stat_attrs[] = { - &dev_attr_packets_finished.attr, - &dev_attr_packets_started.attr, - &dev_attr_kb_read.attr, - &dev_attr_kb_written.attr, - &dev_attr_kb_read_gather.attr, - &dev_attr_reset.attr, - NULL, -}; - -static const struct attribute_group pkt_stat_group = { - .name = "stat", - .attrs = pkt_stat_attrs, -}; - -static ssize_t size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->bio_queue_size); - spin_unlock(&pd->lock); - return n; -} -static DEVICE_ATTR_RO(size); - -static void init_write_congestion_marks(int* lo, int* hi) -{ - if (*hi > 0) { - *hi = max(*hi, 500); - *hi = min(*hi, 1000000); - if (*lo <= 0) - *lo = *hi - 100; - else { - *lo = min(*lo, *hi - 100); - *lo = max(*lo, 100); - } - } else { - *hi = -1; - *lo = -1; - } -} - -static ssize_t congestion_off_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->write_congestion_off); - spin_unlock(&pd->lock); - return n; -} - -static ssize_t congestion_off_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int val, ret; - - ret = kstrtoint(buf, 10, &val); - if (ret) - return ret; - - spin_lock(&pd->lock); - pd->write_congestion_off = val; - init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on); - spin_unlock(&pd->lock); - return len; -} -static DEVICE_ATTR_RW(congestion_off); - -static ssize_t congestion_on_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->write_congestion_on); - spin_unlock(&pd->lock); - return n; -} - -static ssize_t 
congestion_on_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int val, ret; - - ret = kstrtoint(buf, 10, &val); - if (ret) - return ret; - - spin_lock(&pd->lock); - pd->write_congestion_on = val; - init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on); - spin_unlock(&pd->lock); - return len; -} -static DEVICE_ATTR_RW(congestion_on); - -static struct attribute *pkt_wq_attrs[] = { - &dev_attr_congestion_on.attr, - &dev_attr_congestion_off.attr, - &dev_attr_size.attr, - NULL, -}; - -static const struct attribute_group pkt_wq_group = { - .name = "write_queue", - .attrs = pkt_wq_attrs, -}; - -static const struct attribute_group *pkt_groups[] = { - &pkt_stat_group, - &pkt_wq_group, - NULL, -}; - -static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) -{ - if (class_is_registered(&class_pktcdvd)) { - pd->dev = device_create_with_groups(&class_pktcdvd, NULL, - MKDEV(0, 0), pd, pkt_groups, - "%s", pd->disk->disk_name); - if (IS_ERR(pd->dev)) - pd->dev = NULL; - } -} - -static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) -{ - if (class_is_registered(&class_pktcdvd)) - device_unregister(pd->dev); -} - - -/******************************************************************** - /sys/class/pktcdvd/ - add map block device - remove unmap packet dev - device_map show mappings - *******************************************************************/ - -static ssize_t device_map_show(const struct class *c, const struct class_attribute *attr, - char *data) -{ - int n = 0; - int idx; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - for (idx = 0; idx < MAX_WRITERS; idx++) { - struct pktcdvd_device *pd = pkt_devs[idx]; - if (!pd) - continue; - n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n", - pd->disk->disk_name, - MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), - MAJOR(file_bdev(pd->bdev_file)->bd_dev), - MINOR(file_bdev(pd->bdev_file)->bd_dev)); - } - mutex_unlock(&ctl_mutex); - return n; -} -static CLASS_ATTR_RO(device_map); - -static ssize_t add_store(const struct class *c, const struct class_attribute *attr, - const char *buf, size_t count) -{ - unsigned int major, minor; - - if (sscanf(buf, "%u:%u", &major, &minor) == 2) { - /* pkt_setup_dev() expects caller to hold reference to self */ - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - pkt_setup_dev(MKDEV(major, minor), NULL); - - module_put(THIS_MODULE); - - return count; - } - - return -EINVAL; -} -static CLASS_ATTR_WO(add); - -static ssize_t remove_store(const struct class *c, const struct class_attribute *attr, - const char *buf, size_t count) -{ - unsigned int major, minor; - if (sscanf(buf, "%u:%u", &major, &minor) == 2) { - pkt_remove_dev(MKDEV(major, minor)); - return count; - } - return -EINVAL; -} -static CLASS_ATTR_WO(remove); - -static struct attribute *class_pktcdvd_attrs[] = { - &class_attr_add.attr, - &class_attr_remove.attr, - &class_attr_device_map.attr, - NULL, -}; -ATTRIBUTE_GROUPS(class_pktcdvd); - -static struct class class_pktcdvd = { - .name = DRIVER_NAME, - .class_groups = class_pktcdvd_groups, -}; - -static int pkt_sysfs_init(void) -{ - /* - * create control files in sysfs - * /sys/class/pktcdvd/... 
- */ - return class_register(&class_pktcdvd); -} - -static void pkt_sysfs_cleanup(void) -{ - class_unregister(&class_pktcdvd); -} - -/******************************************************************** - entries in debugfs - - /sys/kernel/debug/pktcdvd[0-7]/ - info - - *******************************************************************/ - -static void pkt_count_states(struct pktcdvd_device *pd, int *states) -{ - struct packet_data *pkt; - int i; - - for (i = 0; i < PACKET_NUM_STATES; i++) - states[i] = 0; - - spin_lock(&pd->cdrw.active_list_lock); - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - states[pkt->state]++; - } - spin_unlock(&pd->cdrw.active_list_lock); -} - -static int pkt_seq_show(struct seq_file *m, void *p) -{ - struct pktcdvd_device *pd = m->private; - char *msg; - int states[PACKET_NUM_STATES]; - - seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, - file_bdev(pd->bdev_file)); - - seq_printf(m, "\nSettings:\n"); - seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); - - if (pd->settings.write_type == 0) - msg = "Packet"; - else - msg = "Unknown"; - seq_printf(m, "\twrite type:\t\t%s\n", msg); - - seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); - seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); - - seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); - - if (pd->settings.block_mode == PACKET_BLOCK_MODE1) - msg = "Mode 1"; - else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) - msg = "Mode 2"; - else - msg = "Unknown"; - seq_printf(m, "\tblock mode:\t\t%s\n", msg); - - seq_printf(m, "\nStatistics:\n"); - seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); - seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); - seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); - seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); - seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); - - seq_printf(m, "\nMisc:\n"); - seq_printf(m, "\treference count:\t%d\n", pd->refcnt); - seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); - seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); - seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); - seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); - seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); - - seq_printf(m, "\nQueue state:\n"); - seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); - seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); - seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", pd->current_sector); - - pkt_count_states(pd, states); - seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], states[4], states[5]); - - seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n", - pd->write_congestion_off, - pd->write_congestion_on); - return 0; -} -DEFINE_SHOW_ATTRIBUTE(pkt_seq); - -static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) -{ - if (!pkt_debugfs_root) - return; - pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root); - - pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root, - pd, &pkt_seq_fops); -} - -static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) -{ - if (!pkt_debugfs_root) - return; - debugfs_remove(pd->dfs_f_info); - debugfs_remove(pd->dfs_d_root); - pd->dfs_f_info = NULL; - pd->dfs_d_root = NULL; -} - -static void pkt_debugfs_init(void) -{ - pkt_debugfs_root = 
debugfs_create_dir(DRIVER_NAME, NULL); -} - -static void pkt_debugfs_cleanup(void) -{ - debugfs_remove(pkt_debugfs_root); - pkt_debugfs_root = NULL; -} - -/* ----------------------------------------------------------*/ - - -static void pkt_bio_finished(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - - BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); - if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - dev_dbg(ddev, "queue empty\n"); - atomic_set(&pd->iosched.attention, 1); - wake_up(&pd->wqueue); - } -} - -/* - * Allocate a packet_data struct - */ -static struct packet_data *pkt_alloc_packet_data(int frames) -{ - int i; - struct packet_data *pkt; - - pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL); - if (!pkt) - goto no_pkt; - - pkt->frames = frames; - pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL); - if (!pkt->w_bio) - goto no_bio; - - for (i = 0; i < frames / FRAMES_PER_PAGE; i++) { - pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!pkt->pages[i]) - goto no_page; - } - - spin_lock_init(&pkt->lock); - bio_list_init(&pkt->orig_bios); - - for (i = 0; i < frames; i++) { - pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL); - if (!pkt->r_bios[i]) - goto no_rd_bio; - } - - return pkt; - -no_rd_bio: - for (i = 0; i < frames; i++) - kfree(pkt->r_bios[i]); -no_page: - for (i = 0; i < frames / FRAMES_PER_PAGE; i++) - if (pkt->pages[i]) - __free_page(pkt->pages[i]); - kfree(pkt->w_bio); -no_bio: - kfree(pkt); -no_pkt: - return NULL; -} - -/* - * Free a packet_data struct - */ -static void pkt_free_packet_data(struct packet_data *pkt) -{ - int i; - - for (i = 0; i < pkt->frames; i++) - kfree(pkt->r_bios[i]); - for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++) - __free_page(pkt->pages[i]); - kfree(pkt->w_bio); - kfree(pkt); -} - -static void pkt_shrink_pktlist(struct pktcdvd_device *pd) -{ - struct packet_data *pkt, *next; - - BUG_ON(!list_empty(&pd->cdrw.pkt_active_list)); - - list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) { - pkt_free_packet_data(pkt); - } - INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); -} - -static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) -{ - struct packet_data *pkt; - - BUG_ON(!list_empty(&pd->cdrw.pkt_free_list)); - - while (nr_packets > 0) { - pkt = pkt_alloc_packet_data(pd->settings.size >> 2); - if (!pkt) { - pkt_shrink_pktlist(pd); - return 0; - } - pkt->id = nr_packets; - pkt->pd = pd; - list_add(&pkt->list, &pd->cdrw.pkt_free_list); - nr_packets--; - } - return 1; -} - -static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) -{ - struct rb_node *n = rb_next(&node->rb_node); - if (!n) - return NULL; - return rb_entry(n, struct pkt_rb_node, rb_node); -} - -static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node) -{ - rb_erase(&node->rb_node, &pd->bio_queue); - mempool_free(node, &pd->rb_pool); - pd->bio_queue_size--; - BUG_ON(pd->bio_queue_size < 0); -} - -/* - * Find the first node in the pd->bio_queue rb tree with a starting sector >= s. 
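The lookup implemented just below is effectively a lower_bound over the queued bios, keyed by starting sector: descend the ordered tree, remember the last node whose key is still >= s, and fall back to the in-order successor when the descent ends on a smaller key. A minimal user-space sketch of the same idea over a plain binary search tree (the struct and function names here are illustrative, not the kernel rb-tree API):

#include <stddef.h>

struct qnode {
	unsigned long long sector;        /* starting sector of a queued bio */
	struct qnode *left, *right;
};

/* Return the node with the smallest key that is >= s, or NULL if every
 * queued bio starts before s.  Equivalent to pkt_rbtree_find() plus its
 * pkt_rbtree_next() fallback, folded into a single walk. */
static struct qnode *first_at_or_after(struct qnode *root, unsigned long long s)
{
	struct qnode *best = NULL;

	while (root) {
		if (s <= root->sector) {
			best = root;           /* candidate; look for a smaller one */
			root = root->left;
		} else {
			root = root->right;    /* starts before s; discard left side */
		}
	}
	return best;
}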
- */ -static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s) -{ - struct rb_node *n = pd->bio_queue.rb_node; - struct rb_node *next; - struct pkt_rb_node *tmp; - - if (!n) { - BUG_ON(pd->bio_queue_size > 0); - return NULL; - } - - for (;;) { - tmp = rb_entry(n, struct pkt_rb_node, rb_node); - if (s <= tmp->bio->bi_iter.bi_sector) - next = n->rb_left; - else - next = n->rb_right; - if (!next) - break; - n = next; - } - - if (s > tmp->bio->bi_iter.bi_sector) { - tmp = pkt_rbtree_next(tmp); - if (!tmp) - return NULL; - } - BUG_ON(s > tmp->bio->bi_iter.bi_sector); - return tmp; -} - -/* - * Insert a node into the pd->bio_queue rb tree. - */ -static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node) -{ - struct rb_node **p = &pd->bio_queue.rb_node; - struct rb_node *parent = NULL; - sector_t s = node->bio->bi_iter.bi_sector; - struct pkt_rb_node *tmp; - - while (*p) { - parent = *p; - tmp = rb_entry(parent, struct pkt_rb_node, rb_node); - if (s < tmp->bio->bi_iter.bi_sector) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&node->rb_node, parent, p); - rb_insert_color(&node->rb_node, &pd->bio_queue); - pd->bio_queue_size++; -} - -/* - * Send a packet_command to the underlying block device and - * wait for completion. - */ -static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) -{ - struct request_queue *q = bdev_get_queue(file_bdev(pd->bdev_file)); - struct scsi_cmnd *scmd; - struct request *rq; - int ret = 0; - - rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? - REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); - if (IS_ERR(rq)) - return PTR_ERR(rq); - scmd = blk_mq_rq_to_pdu(rq); - - if (cgc->buflen) { - ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, - GFP_NOIO); - if (ret) - goto out; - } - - scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]); - memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE); - - rq->timeout = 60*HZ; - if (cgc->quiet) - rq->rq_flags |= RQF_QUIET; - - blk_execute_rq(rq, false); - if (scmd->result) - ret = -EIO; -out: - blk_mq_free_request(rq); - return ret; -} - -static const char *sense_key_string(__u8 index) -{ - static const char * const info[] = { - "No sense", "Recovered error", "Not ready", - "Medium error", "Hardware error", "Illegal request", - "Unit attention", "Data protect", "Blank check", - }; - - return index < ARRAY_SIZE(info) ? info[index] : "INVALID"; -} - -/* - * A generic sense dump / resolve mechanism should be implemented across - * all ATAPI + SCSI devices. - */ -static void pkt_dump_sense(struct pktcdvd_device *pd, - struct packet_command *cgc) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct scsi_sense_hdr *sshdr = cgc->sshdr; - - if (sshdr) - dev_err(ddev, "%*ph - sense %02x.%02x.%02x (%s)\n", - CDROM_PACKET_SIZE, cgc->cmd, - sshdr->sense_key, sshdr->asc, sshdr->ascq, - sense_key_string(sshdr->sense_key)); - else - dev_err(ddev, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); -} - -/* - * flush the drive cache to media - */ -static int pkt_flush_cache(struct pktcdvd_device *pd) -{ - struct packet_command cgc; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.cmd[0] = GPCMD_FLUSH_CACHE; - cgc.quiet = 1; - - /* - * the IMMED bit -- we default to not setting it, although that - * would allow a much faster close, this is safer - */ -#if 0 - cgc.cmd[1] = 1 << 1; -#endif - return pkt_generic_packet(pd, &cgc); -} - -/* - * speed is given as the normal factor, e.g. 
4 for 4x - */ -static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, - unsigned write_speed, unsigned read_speed) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - int ret; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_SET_SPEED; - put_unaligned_be16(read_speed, &cgc.cmd[2]); - put_unaligned_be16(write_speed, &cgc.cmd[4]); - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - pkt_dump_sense(pd, &cgc); - - return ret; -} - -/* - * Queue a bio for processing by the low-level CD device. Must be called - * from process context. - */ -static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) -{ - /* - * Some CDRW drives can not handle writes larger than one packet, - * even if the size is a multiple of the packet size. - */ - bio->bi_opf |= REQ_NOMERGE; - - spin_lock(&pd->iosched.lock); - if (bio_data_dir(bio) == READ) - bio_list_add(&pd->iosched.read_queue, bio); - else - bio_list_add(&pd->iosched.write_queue, bio); - spin_unlock(&pd->iosched.lock); - - atomic_set(&pd->iosched.attention, 1); - wake_up(&pd->wqueue); -} - -/* - * Process the queued read/write requests. This function handles special - * requirements for CDRW drives: - * - A cache flush command must be inserted before a read request if the - * previous request was a write. - * - Switching between reading and writing is slow, so don't do it more often - * than necessary. - * - Optimize for throughput at the expense of latency. This means that streaming - * writes will never be interrupted by a read, but if the drive has to seek - * before the next write, switch to reading instead if there are any pending - * read requests. - * - Set the read speed according to current usage pattern. When only reading - * from the device, it's best to use the highest possible read speed, but - * when switching often between reading and writing, it's better to have the - * same read and write speeds. 
- */ -static void pkt_iosched_process_queue(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - - if (atomic_read(&pd->iosched.attention) == 0) - return; - atomic_set(&pd->iosched.attention, 0); - - for (;;) { - struct bio *bio; - int reads_queued, writes_queued; - - spin_lock(&pd->iosched.lock); - reads_queued = !bio_list_empty(&pd->iosched.read_queue); - writes_queued = !bio_list_empty(&pd->iosched.write_queue); - spin_unlock(&pd->iosched.lock); - - if (!reads_queued && !writes_queued) - break; - - if (pd->iosched.writing) { - int need_write_seek = 1; - spin_lock(&pd->iosched.lock); - bio = bio_list_peek(&pd->iosched.write_queue); - spin_unlock(&pd->iosched.lock); - if (bio && (bio->bi_iter.bi_sector == - pd->iosched.last_write)) - need_write_seek = 0; - if (need_write_seek && reads_queued) { - if (atomic_read(&pd->cdrw.pending_bios) > 0) { - dev_dbg(ddev, "write, waiting\n"); - break; - } - pkt_flush_cache(pd); - pd->iosched.writing = 0; - } - } else { - if (!reads_queued && writes_queued) { - if (atomic_read(&pd->cdrw.pending_bios) > 0) { - dev_dbg(ddev, "read, waiting\n"); - break; - } - pd->iosched.writing = 1; - } - } - - spin_lock(&pd->iosched.lock); - if (pd->iosched.writing) - bio = bio_list_pop(&pd->iosched.write_queue); - else - bio = bio_list_pop(&pd->iosched.read_queue); - spin_unlock(&pd->iosched.lock); - - if (!bio) - continue; - - if (bio_data_dir(bio) == READ) - pd->iosched.successive_reads += - bio->bi_iter.bi_size >> 10; - else { - pd->iosched.successive_reads = 0; - pd->iosched.last_write = bio_end_sector(bio); - } - if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { - if (pd->read_speed == pd->write_speed) { - pd->read_speed = MAX_SPEED; - pkt_set_speed(pd, pd->write_speed, pd->read_speed); - } - } else { - if (pd->read_speed != pd->write_speed) { - pd->read_speed = pd->write_speed; - pkt_set_speed(pd, pd->write_speed, pd->read_speed); - } - } - - atomic_inc(&pd->cdrw.pending_bios); - submit_bio_noacct(bio); - } -} - -/* - * Special care is needed if the underlying block device has a small - * max_phys_segments value. 
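The helper just below picks one of two strategies from the number of segments the underlying queue accepts per request: if a whole packet fits at one segment per 2 KiB frame, nothing special is needed; if it at least fits at one segment per page, the driver sets PACKET_MERGE_SEGS and copies data through its own pages instead. A rough standalone model of that decision (the function name and the 4 KiB page size are assumptions of this sketch):

enum merge_policy { ONE_SEG_PER_FRAME, MERGE_INTO_PAGES, QUEUE_TOO_SMALL };

/* packet_sectors is the packet size in 512-byte sectors (128 for a
 * 32-frame, 64 KiB packet); max_segments mirrors queue_max_segments(). */
static enum merge_policy pick_merge_policy(unsigned int packet_sectors,
					   unsigned int max_segments)
{
	unsigned int bytes  = packet_sectors * 512;
	unsigned int frames = bytes / 2048;      /* CD_FRAMESIZE */
	unsigned int pages  = bytes / 4096;      /* assumed PAGE_SIZE */

	if (frames <= max_segments)
		return ONE_SEG_PER_FRAME;
	if (pages <= max_segments)
		return MERGE_INTO_PAGES;
	return QUEUE_TOO_SMALL;                  /* the driver fails with -EIO */
}

For a 32-frame packet that is 32 frames or 16 pages, so a queue advertising, say, 24 segments ends up on the merge path.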
- */ -static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) -{ - struct device *ddev = disk_to_dev(pd->disk); - - if ((pd->settings.size << 9) / CD_FRAMESIZE <= queue_max_segments(q)) { - /* - * The cdrom device can handle one segment/frame - */ - clear_bit(PACKET_MERGE_SEGS, &pd->flags); - return 0; - } - - if ((pd->settings.size << 9) / PAGE_SIZE <= queue_max_segments(q)) { - /* - * We can handle this case at the expense of some extra memory - * copies during write operations - */ - set_bit(PACKET_MERGE_SEGS, &pd->flags); - return 0; - } - - dev_err(ddev, "cdrom max_phys_segments too small\n"); - return -EIO; -} - -static void pkt_end_io_read(struct bio *bio) -{ - struct packet_data *pkt = bio->bi_private; - struct pktcdvd_device *pd = pkt->pd; - BUG_ON(!pd); - - dev_dbg(disk_to_dev(pd->disk), "bio=%p sec0=%llx sec=%llx err=%d\n", - bio, pkt->sector, bio->bi_iter.bi_sector, bio->bi_status); - - if (bio->bi_status) - atomic_inc(&pkt->io_errors); - bio_uninit(bio); - if (atomic_dec_and_test(&pkt->io_wait)) { - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); - } - pkt_bio_finished(pd); -} - -static void pkt_end_io_packet_write(struct bio *bio) -{ - struct packet_data *pkt = bio->bi_private; - struct pktcdvd_device *pd = pkt->pd; - BUG_ON(!pd); - - dev_dbg(disk_to_dev(pd->disk), "id=%d, err=%d\n", pkt->id, bio->bi_status); - - pd->stats.pkt_ended++; - - bio_uninit(bio); - pkt_bio_finished(pd); - atomic_dec(&pkt->io_wait); - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); -} - -/* - * Schedule reads for the holes in a packet - */ -static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - struct device *ddev = disk_to_dev(pd->disk); - int frames_read = 0; - struct bio *bio; - int f; - char written[PACKET_MAX_SIZE]; - - BUG_ON(bio_list_empty(&pkt->orig_bios)); - - atomic_set(&pkt->io_wait, 0); - atomic_set(&pkt->io_errors, 0); - - /* - * Figure out which frames we need to read before we can write. - */ - memset(written, 0, sizeof(written)); - spin_lock(&pkt->lock); - bio_list_for_each(bio, &pkt->orig_bios) { - int first_frame = (bio->bi_iter.bi_sector - pkt->sector) / - (CD_FRAMESIZE >> 9); - int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE; - pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); - BUG_ON(first_frame < 0); - BUG_ON(first_frame + num_frames > pkt->frames); - for (f = first_frame; f < first_frame + num_frames; f++) - written[f] = 1; - } - spin_unlock(&pkt->lock); - - if (pkt->cache_valid) { - dev_dbg(ddev, "zone %llx cached\n", pkt->sector); - goto out_account; - } - - /* - * Schedule reads for missing parts of the packet. 
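The read loop that follows places each missing 2 KiB frame into the packet's page array at page (f * CD_FRAMESIZE) / PAGE_SIZE and offset (f * CD_FRAMESIZE) % PAGE_SIZE, i.e. two frames per 4 KiB page. A tiny illustration of that arithmetic (helper name and page size are assumptions of this sketch):

/* frame 0 -> page 0, offset 0       frame 1 -> page 0, offset 2048
 * frame 2 -> page 1, offset 0       frame 3 -> page 1, offset 2048 */
static void frame_to_page(unsigned int f, unsigned int *page, unsigned int *offset)
{
	*page   = (f * 2048) / 4096;
	*offset = (f * 2048) % 4096;
}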
- */ - for (f = 0; f < pkt->frames; f++) { - int p, offset; - - if (written[f]) - continue; - - bio = pkt->r_bios[f]; - bio_init(bio, file_bdev(pd->bdev_file), bio->bi_inline_vecs, 1, - REQ_OP_READ); - bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); - bio->bi_end_io = pkt_end_io_read; - bio->bi_private = pkt; - - p = (f * CD_FRAMESIZE) / PAGE_SIZE; - offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - dev_dbg(ddev, "Adding frame %d, page:%p offs:%d\n", f, - pkt->pages[p], offset); - if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) - BUG(); - - atomic_inc(&pkt->io_wait); - pkt_queue_bio(pd, bio); - frames_read++; - } - -out_account: - dev_dbg(ddev, "need %d frames for zone %llx\n", frames_read, pkt->sector); - pd->stats.pkt_started++; - pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); -} - -/* - * Find a packet matching zone, or the least recently used packet if - * there is no match. - */ -static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone) -{ - struct packet_data *pkt; - - list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) { - if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) { - list_del_init(&pkt->list); - if (pkt->sector != zone) - pkt->cache_valid = 0; - return pkt; - } - } - BUG(); - return NULL; -} - -static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - if (pkt->cache_valid) { - list_add(&pkt->list, &pd->cdrw.pkt_free_list); - } else { - list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list); - } -} - -static inline void pkt_set_state(struct device *ddev, struct packet_data *pkt, - enum packet_data_state state) -{ - static const char *state_name[] = { - "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" - }; - enum packet_data_state old_state = pkt->state; - - dev_dbg(ddev, "pkt %2d : s=%6llx %s -> %s\n", - pkt->id, pkt->sector, state_name[old_state], state_name[state]); - - pkt->state = state; -} - -/* - * Scan the work queue to see if we can start a new packet. - * returns non-zero if any work was done. - */ -static int pkt_handle_queue(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_data *pkt, *p; - struct bio *bio = NULL; - sector_t zone = 0; /* Suppress gcc warning */ - struct pkt_rb_node *node, *first_node; - struct rb_node *n; - - atomic_set(&pd->scan_queue, 0); - - if (list_empty(&pd->cdrw.pkt_free_list)) { - dev_dbg(ddev, "no pkt\n"); - return 0; - } - - /* - * Try to find a zone we are not already working on. 
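pkt_handle_queue, whose body follows, groups queued bios by zone, i.e. the fixed-size packet a sector falls into. The driver's get_zone() helper lives in the header rather than in this hunk; judging from the masking arithmetic elsewhere in this file, it is essentially a power-of-two alignment of the sector, roughly as sketched below (type and function names are placeholders, and the real helper also folds in pd->offset):

typedef unsigned long long sector_nr;          /* stand-in for sector_t */

/* Align a sector down to the start of its packet-sized zone; assumes the
 * packet size (in 512-byte sectors) is a power of two, as the masking in
 * pkt_probe_settings() suggests. */
static sector_nr zone_of(sector_nr sector, sector_nr packet_sectors)
{
	return sector & ~(packet_sectors - 1);
}

With 32-frame packets (128 sectors) and a zero offset, sector 300 maps to the zone starting at sector 256, which spans sectors 256..383.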
- */ - spin_lock(&pd->lock); - first_node = pkt_rbtree_find(pd, pd->current_sector); - if (!first_node) { - n = rb_first(&pd->bio_queue); - if (n) - first_node = rb_entry(n, struct pkt_rb_node, rb_node); - } - node = first_node; - while (node) { - bio = node->bio; - zone = get_zone(bio->bi_iter.bi_sector, pd); - list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { - if (p->sector == zone) { - bio = NULL; - goto try_next_bio; - } - } - break; -try_next_bio: - node = pkt_rbtree_next(node); - if (!node) { - n = rb_first(&pd->bio_queue); - if (n) - node = rb_entry(n, struct pkt_rb_node, rb_node); - } - if (node == first_node) - node = NULL; - } - spin_unlock(&pd->lock); - if (!bio) { - dev_dbg(ddev, "no bio\n"); - return 0; - } - - pkt = pkt_get_packet_data(pd, zone); - - pd->current_sector = zone + pd->settings.size; - pkt->sector = zone; - BUG_ON(pkt->frames != pd->settings.size >> 2); - pkt->write_size = 0; - - /* - * Scan work queue for bios in the same zone and link them - * to this packet. - */ - spin_lock(&pd->lock); - dev_dbg(ddev, "looking for zone %llx\n", zone); - while ((node = pkt_rbtree_find(pd, zone)) != NULL) { - sector_t tmp = get_zone(node->bio->bi_iter.bi_sector, pd); - - bio = node->bio; - dev_dbg(ddev, "found zone=%llx\n", tmp); - if (tmp != zone) - break; - pkt_rbtree_erase(pd, node); - spin_lock(&pkt->lock); - bio_list_add(&pkt->orig_bios, bio); - pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE; - spin_unlock(&pkt->lock); - } - /* check write congestion marks, and if bio_queue_size is - * below, wake up any waiters - */ - if (pd->congested && - pd->bio_queue_size <= pd->write_congestion_off) { - pd->congested = false; - wake_up_var(&pd->congested); - } - spin_unlock(&pd->lock); - - pkt->sleep_time = max(PACKET_WAIT_TIME, 1); - pkt_set_state(ddev, pkt, PACKET_WAITING_STATE); - atomic_set(&pkt->run_sm, 1); - - spin_lock(&pd->cdrw.active_list_lock); - list_add(&pkt->list, &pd->cdrw.pkt_active_list); - spin_unlock(&pd->cdrw.active_list_lock); - - return 1; -} - -/** - * bio_list_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * Stops when it reaches the end of either the @src list or @dst list - that is, - * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of - * bios). - */ -static void bio_list_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter = src->bi_iter; - struct bvec_iter dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } -} - -/* - * Assemble a bio to write one packet and queue the bio for processing - * by the underlying block device. - */ -static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - struct device *ddev = disk_to_dev(pd->disk); - int f; - - bio_init(pkt->w_bio, file_bdev(pd->bdev_file), pkt->w_bio->bi_inline_vecs, - pkt->frames, REQ_OP_WRITE); - pkt->w_bio->bi_iter.bi_sector = pkt->sector; - pkt->w_bio->bi_end_io = pkt_end_io_packet_write; - pkt->w_bio->bi_private = pkt; - - /* XXX: locking? 
*/ - for (f = 0; f < pkt->frames; f++) { - struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; - unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - - if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset)) - BUG(); - } - dev_dbg(ddev, "vcnt=%d\n", pkt->w_bio->bi_vcnt); - - /* - * Fill-in bvec with data from orig_bios. - */ - spin_lock(&pkt->lock); - bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head); - - pkt_set_state(ddev, pkt, PACKET_WRITE_WAIT_STATE); - spin_unlock(&pkt->lock); - - dev_dbg(ddev, "Writing %d frames for zone %llx\n", pkt->write_size, pkt->sector); - - if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) - pkt->cache_valid = 1; - else - pkt->cache_valid = 0; - - /* Start the write request */ - atomic_set(&pkt->io_wait, 1); - pkt_queue_bio(pd, pkt->w_bio); -} - -static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) -{ - struct bio *bio; - - if (status) - pkt->cache_valid = 0; - - /* Finish all bios corresponding to this packet */ - while ((bio = bio_list_pop(&pkt->orig_bios))) { - bio->bi_status = status; - bio_endio(bio); - } -} - -static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - struct device *ddev = disk_to_dev(pd->disk); - - dev_dbg(ddev, "pkt %d\n", pkt->id); - - for (;;) { - switch (pkt->state) { - case PACKET_WAITING_STATE: - if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0)) - return; - - pkt->sleep_time = 0; - pkt_gather_data(pd, pkt); - pkt_set_state(ddev, pkt, PACKET_READ_WAIT_STATE); - break; - - case PACKET_READ_WAIT_STATE: - if (atomic_read(&pkt->io_wait) > 0) - return; - - if (atomic_read(&pkt->io_errors) > 0) { - pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE); - } else { - pkt_start_write(pd, pkt); - } - break; - - case PACKET_WRITE_WAIT_STATE: - if (atomic_read(&pkt->io_wait) > 0) - return; - - if (!pkt->w_bio->bi_status) { - pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE); - } else { - pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE); - } - break; - - case PACKET_RECOVERY_STATE: - dev_dbg(ddev, "No recovery possible\n"); - pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE); - break; - - case PACKET_FINISHED_STATE: - pkt_finish_packet(pkt, pkt->w_bio->bi_status); - return; - - default: - BUG(); - break; - } - } -} - -static void pkt_handle_packets(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_data *pkt, *next; - - /* - * Run state machine for active packets - */ - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (atomic_read(&pkt->run_sm) > 0) { - atomic_set(&pkt->run_sm, 0); - pkt_run_state_machine(pd, pkt); - } - } - - /* - * Move no longer active packets to the free list - */ - spin_lock(&pd->cdrw.active_list_lock); - list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) { - if (pkt->state == PACKET_FINISHED_STATE) { - list_del(&pkt->list); - pkt_put_packet_data(pd, pkt); - pkt_set_state(ddev, pkt, PACKET_IDLE_STATE); - atomic_set(&pd->scan_queue, 1); - } - } - spin_unlock(&pd->cdrw.active_list_lock); -} - -/* - * kcdrwd is woken up when writes have been queued for one of our - * registered devices - */ -static int kcdrwd(void *foobar) -{ - struct pktcdvd_device *pd = foobar; - struct device *ddev = disk_to_dev(pd->disk); - struct packet_data *pkt; - int states[PACKET_NUM_STATES]; - long min_sleep_time, residue; - - set_user_nice(current, MIN_NICE); - set_freezable(); - - for (;;) { - DECLARE_WAITQUEUE(wait, current); - - /* - * Wait until there is something to do 
- */ - add_wait_queue(&pd->wqueue, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - /* Check if we need to run pkt_handle_queue */ - if (atomic_read(&pd->scan_queue) > 0) - goto work_to_do; - - /* Check if we need to run the state machine for some packet */ - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (atomic_read(&pkt->run_sm) > 0) - goto work_to_do; - } - - /* Check if we need to process the iosched queues */ - if (atomic_read(&pd->iosched.attention) != 0) - goto work_to_do; - - /* Otherwise, go to sleep */ - pkt_count_states(pd, states); - dev_dbg(ddev, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], states[4], states[5]); - - min_sleep_time = MAX_SCHEDULE_TIMEOUT; - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (pkt->sleep_time && pkt->sleep_time < min_sleep_time) - min_sleep_time = pkt->sleep_time; - } - - dev_dbg(ddev, "sleeping\n"); - residue = schedule_timeout(min_sleep_time); - dev_dbg(ddev, "wake up\n"); - - /* make swsusp happy with our thread */ - try_to_freeze(); - - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (!pkt->sleep_time) - continue; - pkt->sleep_time -= min_sleep_time - residue; - if (pkt->sleep_time <= 0) { - pkt->sleep_time = 0; - atomic_inc(&pkt->run_sm); - } - } - - if (kthread_should_stop()) - break; - } -work_to_do: - set_current_state(TASK_RUNNING); - remove_wait_queue(&pd->wqueue, &wait); - - if (kthread_should_stop()) - break; - - /* - * if pkt_handle_queue returns true, we can queue - * another request. - */ - while (pkt_handle_queue(pd)) - ; - - /* - * Handle packet state machine - */ - pkt_handle_packets(pd); - - /* - * Handle iosched queues - */ - pkt_iosched_process_queue(pd); - } - - return 0; -} - -static void pkt_print_settings(struct pktcdvd_device *pd) -{ - dev_info(disk_to_dev(pd->disk), "%s packets, %u blocks, Mode-%c disc\n", - pd->settings.fp ? "Fixed" : "Variable", - pd->settings.size >> 2, - pd->settings.block_mode == 8 ? 
'1' : '2'); -} - -static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) -{ - memset(cgc->cmd, 0, sizeof(cgc->cmd)); - - cgc->cmd[0] = GPCMD_MODE_SENSE_10; - cgc->cmd[2] = page_code | (page_control << 6); - put_unaligned_be16(cgc->buflen, &cgc->cmd[7]); - cgc->data_direction = CGC_DATA_READ; - return pkt_generic_packet(pd, cgc); -} - -static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc) -{ - memset(cgc->cmd, 0, sizeof(cgc->cmd)); - memset(cgc->buffer, 0, 2); - cgc->cmd[0] = GPCMD_MODE_SELECT_10; - cgc->cmd[1] = 0x10; /* PF */ - put_unaligned_be16(cgc->buflen, &cgc->cmd[7]); - cgc->data_direction = CGC_DATA_WRITE; - return pkt_generic_packet(pd, cgc); -} - -static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) -{ - struct packet_command cgc; - int ret; - - /* set up command and get the disc info */ - init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DISC_INFO; - cgc.cmd[8] = cgc.buflen = 2; - cgc.quiet = 1; - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - return ret; - - /* not all drives have the same disc_info length, so requeue - * packet with the length the drive tells us it can supply - */ - cgc.buflen = be16_to_cpu(di->disc_information_length) + - sizeof(di->disc_information_length); - - if (cgc.buflen > sizeof(disc_information)) - cgc.buflen = sizeof(disc_information); - - cgc.cmd[8] = cgc.buflen; - return pkt_generic_packet(pd, &cgc); -} - -static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti) -{ - struct packet_command cgc; - int ret; - - init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; - cgc.cmd[1] = type & 3; - put_unaligned_be16(track, &cgc.cmd[4]); - cgc.cmd[8] = 8; - cgc.quiet = 1; - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - return ret; - - cgc.buflen = be16_to_cpu(ti->track_information_length) + - sizeof(ti->track_information_length); - - if (cgc.buflen > sizeof(track_information)) - cgc.buflen = sizeof(track_information); - - cgc.cmd[8] = cgc.buflen; - return pkt_generic_packet(pd, &cgc); -} - -static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, - long *last_written) -{ - disc_information di; - track_information ti; - __u32 last_track; - int ret; - - ret = pkt_get_disc_info(pd, &di); - if (ret) - return ret; - - last_track = (di.last_track_msb << 8) | di.last_track_lsb; - ret = pkt_get_track_info(pd, last_track, 1, &ti); - if (ret) - return ret; - - /* if this track is blank, try the previous. */ - if (ti.blank) { - last_track--; - ret = pkt_get_track_info(pd, last_track, 1, &ti); - if (ret) - return ret; - } - - /* if last recorded field is valid, return it. 
*/ - if (ti.lra_v) { - *last_written = be32_to_cpu(ti.last_rec_address); - } else { - /* make it up instead */ - *last_written = be32_to_cpu(ti.track_start) + - be32_to_cpu(ti.track_size); - if (ti.free_blocks) - *last_written -= (be32_to_cpu(ti.free_blocks) + 7); - } - return 0; -} - -/* - * write mode select package based on pd->settings - */ -static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - write_param_page *wp; - char buffer[128]; - int ret, size; - - /* doesn't apply to DVD+RW or DVD-RAM */ - if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12)) - return 0; - - memset(buffer, 0, sizeof(buffer)); - init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); - cgc.sshdr = &sshdr; - ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - size = 2 + get_unaligned_be16(&buffer[0]); - pd->mode_offset = get_unaligned_be16(&buffer[6]); - if (size > sizeof(buffer)) - size = sizeof(buffer); - - /* - * now get it all - */ - init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); - cgc.sshdr = &sshdr; - ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - /* - * write page is offset header + block descriptor length - */ - wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset]; - - wp->fp = pd->settings.fp; - wp->track_mode = pd->settings.track_mode; - wp->write_type = pd->settings.write_type; - wp->data_block_type = pd->settings.block_mode; - - wp->multi_session = 0; - -#ifdef PACKET_USE_LS - wp->link_size = 7; - wp->ls_v = 1; -#endif - - if (wp->data_block_type == PACKET_BLOCK_MODE1) { - wp->session_format = 0; - wp->subhdr2 = 0x20; - } else if (wp->data_block_type == PACKET_BLOCK_MODE2) { - wp->session_format = 0x20; - wp->subhdr2 = 8; -#if 0 - wp->mcn[0] = 0x80; - memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1); -#endif - } else { - /* - * paranoia - */ - dev_err(ddev, "write mode wrong %d\n", wp->data_block_type); - return 1; - } - wp->packet_size = cpu_to_be32(pd->settings.size >> 2); - - cgc.buflen = cgc.cmd[8] = size; - ret = pkt_mode_select(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - pkt_print_settings(pd); - return 0; -} - -/* - * 1 -- we can write to this track, 0 -- we can't - */ -static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) -{ - struct device *ddev = disk_to_dev(pd->disk); - - switch (pd->mmc3_profile) { - case 0x1a: /* DVD+RW */ - case 0x12: /* DVD-RAM */ - /* The track is always writable on DVD+RW/DVD-RAM */ - return 1; - default: - break; - } - - if (!ti->packet || !ti->fp) - return 0; - - /* - * "good" settings as per Mt Fuji. 
- */ - if (ti->rt == 0 && ti->blank == 0) - return 1; - - if (ti->rt == 0 && ti->blank == 1) - return 1; - - if (ti->rt == 1 && ti->blank == 0) - return 1; - - dev_err(ddev, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); - return 0; -} - -/* - * 1 -- we can write to this disc, 0 -- we can't - */ -static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) -{ - struct device *ddev = disk_to_dev(pd->disk); - - switch (pd->mmc3_profile) { - case 0x0a: /* CD-RW */ - case 0xffff: /* MMC3 not supported */ - break; - case 0x1a: /* DVD+RW */ - case 0x13: /* DVD-RW */ - case 0x12: /* DVD-RAM */ - return 1; - default: - dev_dbg(ddev, "Wrong disc profile (%x)\n", pd->mmc3_profile); - return 0; - } - - /* - * for disc type 0xff we should probably reserve a new track. - * but i'm not sure, should we leave this to user apps? probably. - */ - if (di->disc_type == 0xff) { - dev_notice(ddev, "unknown disc - no track?\n"); - return 0; - } - - if (di->disc_type != 0x20 && di->disc_type != 0) { - dev_err(ddev, "wrong disc type (%x)\n", di->disc_type); - return 0; - } - - if (di->erasable == 0) { - dev_err(ddev, "disc not erasable\n"); - return 0; - } - - if (di->border_status == PACKET_SESSION_RESERVED) { - dev_err(ddev, "can't write to last track (reserved)\n"); - return 0; - } - - return 1; -} - -static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - unsigned char buf[12]; - disc_information di; - track_information ti; - int ret, track; - - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_GET_CONFIGURATION; - cgc.cmd[8] = 8; - ret = pkt_generic_packet(pd, &cgc); - pd->mmc3_profile = ret ? 0xffff : get_unaligned_be16(&buf[6]); - - memset(&di, 0, sizeof(disc_information)); - memset(&ti, 0, sizeof(track_information)); - - ret = pkt_get_disc_info(pd, &di); - if (ret) { - dev_err(ddev, "failed get_disc\n"); - return ret; - } - - if (!pkt_writable_disc(pd, &di)) - return -EROFS; - - pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; - - track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ - ret = pkt_get_track_info(pd, track, 1, &ti); - if (ret) { - dev_err(ddev, "failed get_track\n"); - return ret; - } - - if (!pkt_writable_track(pd, &ti)) { - dev_err(ddev, "can't write to this track\n"); - return -EROFS; - } - - /* - * we keep packet size in 512 byte units, makes it easier to - * deal with request calculations. - */ - pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; - if (pd->settings.size == 0) { - dev_notice(ddev, "detected zero packet size!\n"); - return -ENXIO; - } - if (pd->settings.size > PACKET_MAX_SECTORS) { - dev_err(ddev, "packet size is too big\n"); - return -EROFS; - } - pd->settings.fp = ti.fp; - pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1); - - if (ti.nwa_v) { - pd->nwa = be32_to_cpu(ti.next_writable); - set_bit(PACKET_NWA_VALID, &pd->flags); - } - - /* - * in theory we could use lra on -RW media as well and just zero - * blocks that haven't been written yet, but in practice that - * is just a no-go. we'll use that for -R, naturally. 
- */ - if (ti.lra_v) { - pd->lra = be32_to_cpu(ti.last_rec_address); - set_bit(PACKET_LRA_VALID, &pd->flags); - } else { - pd->lra = 0xffffffff; - set_bit(PACKET_LRA_VALID, &pd->flags); - } - - /* - * fine for now - */ - pd->settings.link_loss = 7; - pd->settings.write_type = 0; /* packet */ - pd->settings.track_mode = ti.track_mode; - - /* - * mode1 or mode2 disc - */ - switch (ti.data_mode) { - case PACKET_MODE1: - pd->settings.block_mode = PACKET_BLOCK_MODE1; - break; - case PACKET_MODE2: - pd->settings.block_mode = PACKET_BLOCK_MODE2; - break; - default: - dev_err(ddev, "unknown data mode\n"); - return -EROFS; - } - return 0; -} - -/* - * enable/disable write caching on drive - */ -static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[64]; - bool set = IS_ENABLED(CONFIG_CDROM_PKTCDVD_WCACHE); - int ret; - - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.buflen = pd->mode_offset + 12; - - /* - * caching mode page might not be there, so quiet this command - */ - cgc.quiet = 1; - - ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0); - if (ret) - return ret; - - /* - * use drive write caching -- we need deferred error handling to be - * able to successfully recover with this option (drive will return good - * status as soon as the cdb is validated). - */ - buf[pd->mode_offset + 10] |= (set << 2); - - cgc.buflen = cgc.cmd[8] = 2 + get_unaligned_be16(&buf[0]); - ret = pkt_mode_select(pd, &cgc); - if (ret) { - dev_err(ddev, "write caching control failed\n"); - pkt_dump_sense(pd, &cgc); - } else if (!ret && set) - dev_notice(ddev, "enabled write caching\n"); - return ret; -} - -static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag) -{ - struct packet_command cgc; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL; - cgc.cmd[4] = lockflag ? 1 : 0; - return pkt_generic_packet(pd, &cgc); -} - -/* - * Returns drive maximum write speed - */ -static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, - unsigned *write_speed) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[256+18]; - unsigned char *cap_buf; - int ret, offset; - - cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); - cgc.sshdr = &sshdr; - - ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); - if (ret) { - cgc.buflen = pd->mode_offset + cap_buf[1] + 2 + - sizeof(struct mode_page_header); - ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - } - - offset = 20; /* Obsoleted field, used by older drives */ - if (cap_buf[1] >= 28) - offset = 28; /* Current write speed selected */ - if (cap_buf[1] >= 30) { - /* If the drive reports at least one "Logical Unit Write - * Speed Performance Descriptor Block", use the information - * in the first block. 
(contains the highest speed) - */ - int num_spdb = get_unaligned_be16(&cap_buf[30]); - if (num_spdb > 0) - offset = 34; - } - - *write_speed = get_unaligned_be16(&cap_buf[offset]); - return 0; -} - -/* These tables from cdrecord - I don't have orange book */ -/* standard speed CD-RW (1-4x) */ -static char clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -/* high speed CD-RW (-10x) */ -static char hs_clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -/* ultra high speed CD-RW */ -static char us_clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0 -}; - -/* - * reads the maximum media speed from ATIP - */ -static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, - unsigned *speed) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[64]; - unsigned int size, st, sp; - int ret; - - init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; - cgc.cmd[1] = 2; - cgc.cmd[2] = 4; /* READ ATIP */ - cgc.cmd[8] = 2; - ret = pkt_generic_packet(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - size = 2 + get_unaligned_be16(&buf[0]); - if (size > sizeof(buf)) - size = sizeof(buf); - - init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; - cgc.cmd[1] = 2; - cgc.cmd[2] = 4; - cgc.cmd[8] = size; - ret = pkt_generic_packet(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - if (!(buf[6] & 0x40)) { - dev_notice(ddev, "disc type is not CD-RW\n"); - return 1; - } - if (!(buf[6] & 0x4)) { - dev_notice(ddev, "A1 values on media are not valid, maybe not CDRW?\n"); - return 1; - } - - st = (buf[6] >> 3) & 0x7; /* disc sub-type */ - - sp = buf[16] & 0xf; /* max speed from ATIP A1 field */ - - /* Info from cdrecord */ - switch (st) { - case 0: /* standard speed */ - *speed = clv_to_speed[sp]; - break; - case 1: /* high speed */ - *speed = hs_clv_to_speed[sp]; - break; - case 2: /* ultra high speed */ - *speed = us_clv_to_speed[sp]; - break; - default: - dev_notice(ddev, "unknown disc sub-type %d\n", st); - return 1; - } - if (*speed) { - dev_info(ddev, "maximum media speed: %d\n", *speed); - return 0; - } else { - dev_notice(ddev, "unknown speed %d for sub-type %d\n", sp, st); - return 1; - } -} - -static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - int ret; - - dev_dbg(ddev, "Performing OPC\n"); - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.sshdr = &sshdr; - cgc.timeout = 60*HZ; - cgc.cmd[0] = GPCMD_SEND_OPC; - cgc.cmd[1] = 1; - ret = pkt_generic_packet(pd, &cgc); - if (ret) - pkt_dump_sense(pd, &cgc); - return ret; -} - -static int pkt_open_write(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - int ret; - unsigned int write_speed, media_write_speed, read_speed; - - ret = pkt_probe_settings(pd); - if (ret) { - dev_dbg(ddev, "failed probe\n"); - return ret; - } - - ret = pkt_set_write_settings(pd); - if (ret) { - dev_notice(ddev, "failed saving write settings\n"); - return -EIO; - } - - pkt_write_caching(pd); - - ret = pkt_get_max_speed(pd, &write_speed); - if (ret) - write_speed = 16 * 177; - 
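The fallback at the end of this hunk, and the divide-by-176 in the default branch of the switch just below, convert between CD "x" units and the kB/s values used by the MMC speed fields, where 1x is roughly 176.4 kB/s: the driver multiplies by 177 (rounding up) when it has to guess and divides by 176 when reporting. As a quick check, the 16 * 177 = 2832 kB/s fallback reads back as 2832 / 176, about 16x, and a 4x CD-RW medium becomes 4 * 177 = 708 kB/s, reported as 708 / 176, about 4x.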
switch (pd->mmc3_profile) { - case 0x13: /* DVD-RW */ - case 0x1a: /* DVD+RW */ - case 0x12: /* DVD-RAM */ - dev_notice(ddev, "write speed %ukB/s\n", write_speed); - break; - default: - ret = pkt_media_speed(pd, &media_write_speed); - if (ret) - media_write_speed = 16; - write_speed = min(write_speed, media_write_speed * 177); - dev_notice(ddev, "write speed %ux\n", write_speed / 176); - break; - } - read_speed = write_speed; - - ret = pkt_set_speed(pd, write_speed, read_speed); - if (ret) { - dev_notice(ddev, "couldn't set write speed\n"); - return -EIO; - } - pd->write_speed = write_speed; - pd->read_speed = read_speed; - - ret = pkt_perform_opc(pd); - if (ret) - dev_notice(ddev, "Optimum Power Calibration failed\n"); - - return 0; -} - -/* - * called at open time. - */ -static int pkt_open_dev(struct pktcdvd_device *pd, bool write) -{ - struct device *ddev = disk_to_dev(pd->disk); - int ret; - long lba; - struct request_queue *q; - struct file *bdev_file; - - /* - * We need to re-open the cdrom device without O_NONBLOCK to be able - * to read/write from/to it. It is already opened in O_NONBLOCK mode - * so open should not fail. - */ - bdev_file = bdev_file_open_by_dev(file_bdev(pd->bdev_file)->bd_dev, - BLK_OPEN_READ, pd, NULL); - if (IS_ERR(bdev_file)) { - ret = PTR_ERR(bdev_file); - goto out; - } - pd->f_open_bdev = bdev_file; - - ret = pkt_get_last_written(pd, &lba); - if (ret) { - dev_err(ddev, "pkt_get_last_written failed\n"); - goto out_putdev; - } - - set_capacity(pd->disk, lba << 2); - set_capacity_and_notify(file_bdev(pd->bdev_file)->bd_disk, lba << 2); - - q = bdev_get_queue(file_bdev(pd->bdev_file)); - if (write) { - ret = pkt_open_write(pd); - if (ret) - goto out_putdev; - set_bit(PACKET_WRITABLE, &pd->flags); - } else { - pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - clear_bit(PACKET_WRITABLE, &pd->flags); - } - - ret = pkt_set_segment_merging(pd, q); - if (ret) - goto out_putdev; - - if (write) { - if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - dev_err(ddev, "not enough memory for buffers\n"); - ret = -ENOMEM; - goto out_putdev; - } - dev_info(ddev, "%lukB available on disc\n", lba << 1); - } - set_blocksize(bdev_file, CD_FRAMESIZE); - - return 0; - -out_putdev: - fput(bdev_file); -out: - return ret; -} - -/* - * called when the device is closed. makes sure that the device flushes - * the internal cache before we close. 
- */ -static void pkt_release_dev(struct pktcdvd_device *pd, int flush) -{ - struct device *ddev = disk_to_dev(pd->disk); - - if (flush && pkt_flush_cache(pd)) - dev_notice(ddev, "not flushing cache\n"); - - pkt_lock_door(pd, 0); - - pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - fput(pd->f_open_bdev); - pd->f_open_bdev = NULL; - - pkt_shrink_pktlist(pd); -} - -static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) -{ - if (dev_minor >= MAX_WRITERS) - return NULL; - - dev_minor = array_index_nospec(dev_minor, MAX_WRITERS); - return pkt_devs[dev_minor]; -} - -static int pkt_open(struct gendisk *disk, blk_mode_t mode) -{ - struct pktcdvd_device *pd = NULL; - int ret; - - mutex_lock(&pktcdvd_mutex); - mutex_lock(&ctl_mutex); - pd = pkt_find_dev_from_minor(disk->first_minor); - if (!pd) { - ret = -ENODEV; - goto out; - } - BUG_ON(pd->refcnt < 0); - - pd->refcnt++; - if (pd->refcnt > 1) { - if ((mode & BLK_OPEN_WRITE) && - !test_bit(PACKET_WRITABLE, &pd->flags)) { - ret = -EBUSY; - goto out_dec; - } - } else { - ret = pkt_open_dev(pd, mode & BLK_OPEN_WRITE); - if (ret) - goto out_dec; - } - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); - return 0; - -out_dec: - pd->refcnt--; -out: - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); - return ret; -} - -static void pkt_release(struct gendisk *disk) -{ - struct pktcdvd_device *pd = disk->private_data; - - mutex_lock(&pktcdvd_mutex); - mutex_lock(&ctl_mutex); - pd->refcnt--; - BUG_ON(pd->refcnt < 0); - if (pd->refcnt == 0) { - int flush = test_bit(PACKET_WRITABLE, &pd->flags); - pkt_release_dev(pd, flush); - } - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); -} - - -static void pkt_end_io_read_cloned(struct bio *bio) -{ - struct packet_stacked_data *psd = bio->bi_private; - struct pktcdvd_device *pd = psd->pd; - - psd->bio->bi_status = bio->bi_status; - bio_put(bio); - bio_endio(psd->bio); - mempool_free(psd, &psd_pool); - pkt_bio_finished(pd); -} - -static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) -{ - struct bio *cloned_bio = bio_alloc_clone(file_bdev(pd->bdev_file), bio, - GFP_NOIO, &pkt_bio_set); - struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); - - psd->pd = pd; - psd->bio = bio; - cloned_bio->bi_private = psd; - cloned_bio->bi_end_io = pkt_end_io_read_cloned; - pd->stats.secs_r += bio_sectors(bio); - pkt_queue_bio(pd, cloned_bio); -} - -static void pkt_make_request_write(struct bio *bio) -{ - struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; - sector_t zone; - struct packet_data *pkt; - int was_empty, blocked_bio; - struct pkt_rb_node *node; - - zone = get_zone(bio->bi_iter.bi_sector, pd); - - /* - * If we find a matching packet in state WAITING or READ_WAIT, we can - * just append this bio to that packet. 
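Write throttling a little further down in pkt_make_request_write() works as a hysteresis pair: submitters sleep once the number of queued bios reaches the congestion-on mark, and pkt_handle_queue() only wakes them again when the queue drains to the congestion-off mark. The marks themselves are normalised by init_write_congestion_marks() in the sysfs code near the top of this file; a compact user-space mirror of that normalisation (the function name here is an invention of this sketch):

/* A non-positive "on" mark disables throttling; otherwise "on" is kept in
 * [500, 1000000] and "off" in [100, on - 100]. */
static void normalise_marks(int *off, int *on)
{
	if (*on <= 0) {
		*on = -1;
		*off = -1;
		return;
	}
	if (*on < 500)
		*on = 500;
	if (*on > 1000000)
		*on = 1000000;
	if (*off <= 0)
		*off = *on - 100;
	if (*off > *on - 100)
		*off = *on - 100;
	if (*off < 100)
		*off = 100;
}

Writing 1000 to congestion_on while congestion_off is unset therefore yields on = 1000, off = 900.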
- */ - spin_lock(&pd->cdrw.active_list_lock); - blocked_bio = 0; - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (pkt->sector == zone) { - spin_lock(&pkt->lock); - if ((pkt->state == PACKET_WAITING_STATE) || - (pkt->state == PACKET_READ_WAIT_STATE)) { - bio_list_add(&pkt->orig_bios, bio); - pkt->write_size += - bio->bi_iter.bi_size / CD_FRAMESIZE; - if ((pkt->write_size >= pkt->frames) && - (pkt->state == PACKET_WAITING_STATE)) { - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); - } - spin_unlock(&pkt->lock); - spin_unlock(&pd->cdrw.active_list_lock); - return; - } else { - blocked_bio = 1; - } - spin_unlock(&pkt->lock); - } - } - spin_unlock(&pd->cdrw.active_list_lock); - - /* - * Test if there is enough room left in the bio work queue - * (queue size >= congestion on mark). - * If not, wait till the work queue size is below the congestion off mark. - */ - spin_lock(&pd->lock); - if (pd->write_congestion_on > 0 - && pd->bio_queue_size >= pd->write_congestion_on) { - struct wait_bit_queue_entry wqe; - - init_wait_var_entry(&wqe, &pd->congested, 0); - for (;;) { - prepare_to_wait_event(__var_waitqueue(&pd->congested), - &wqe.wq_entry, - TASK_UNINTERRUPTIBLE); - if (pd->bio_queue_size <= pd->write_congestion_off) - break; - pd->congested = true; - spin_unlock(&pd->lock); - schedule(); - spin_lock(&pd->lock); - } - } - spin_unlock(&pd->lock); - - /* - * No matching packet found. Store the bio in the work queue. - */ - node = mempool_alloc(&pd->rb_pool, GFP_NOIO); - node->bio = bio; - spin_lock(&pd->lock); - BUG_ON(pd->bio_queue_size < 0); - was_empty = (pd->bio_queue_size == 0); - pkt_rbtree_insert(pd, node); - spin_unlock(&pd->lock); - - /* - * Wake up the worker thread. - */ - atomic_set(&pd->scan_queue, 1); - if (was_empty) { - /* This wake_up is required for correct operation */ - wake_up(&pd->wqueue); - } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) { - /* - * This wake up is not required for correct operation, - * but improves performance in some cases. - */ - wake_up(&pd->wqueue); - } -} - -static void pkt_submit_bio(struct bio *bio) -{ - struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; - struct device *ddev = disk_to_dev(pd->disk); - struct bio *split; - - bio = bio_split_to_limits(bio); - if (!bio) - return; - - dev_dbg(ddev, "start = %6llx stop = %6llx\n", - bio->bi_iter.bi_sector, bio_end_sector(bio)); - - /* - * Clone READ bios so we can have our own bi_end_io callback. 
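The splitting loop a little further down in pkt_submit_bio() cuts a WRITE bio whose last sector lands in a different zone than its first at the zone boundary, so that every piece maps onto exactly one packet. A short worked example, again assuming 32-frame (128-sector) packets and a zero offset: a write covering sectors 200..279 starts in the zone at 128 but ends in the zone at 256, so it is split after 256 - 200 = 56 sectors into [200..255] for the packet at zone 128 and [256..279] for the packet at zone 256.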
- */ - if (bio_data_dir(bio) == READ) { - pkt_make_request_read(pd, bio); - return; - } - - if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - dev_notice(ddev, "WRITE for ro device (%llu)\n", bio->bi_iter.bi_sector); - goto end_io; - } - - if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { - dev_err(ddev, "wrong bio size\n"); - goto end_io; - } - - do { - sector_t zone = get_zone(bio->bi_iter.bi_sector, pd); - sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd); - - if (last_zone != zone) { - BUG_ON(last_zone != zone + pd->settings.size); - - split = bio_split(bio, last_zone - - bio->bi_iter.bi_sector, - GFP_NOIO, &pkt_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } - - pkt_make_request_write(split); - } while (split != bio); - - return; -end_io: - bio_io_error(bio); -} - -static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) -{ - struct device *ddev = disk_to_dev(pd->disk); - int i; - struct file *bdev_file; - struct scsi_device *sdev; - - if (pd->pkt_dev == dev) { - dev_err(ddev, "recursive setup not allowed\n"); - return -EBUSY; - } - for (i = 0; i < MAX_WRITERS; i++) { - struct pktcdvd_device *pd2 = pkt_devs[i]; - if (!pd2) - continue; - if (file_bdev(pd2->bdev_file)->bd_dev == dev) { - dev_err(ddev, "%pg already setup\n", - file_bdev(pd2->bdev_file)); - return -EBUSY; - } - if (pd2->pkt_dev == dev) { - dev_err(ddev, "can't chain pktcdvd devices\n"); - return -EBUSY; - } - } - - bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, - NULL, NULL); - if (IS_ERR(bdev_file)) - return PTR_ERR(bdev_file); - sdev = scsi_device_from_queue(file_bdev(bdev_file)->bd_disk->queue); - if (!sdev) { - fput(bdev_file); - return -EINVAL; - } - put_device(&sdev->sdev_gendev); - - /* This is safe, since we have a reference from open(). */ - __module_get(THIS_MODULE); - - pd->bdev_file = bdev_file; - - atomic_set(&pd->cdrw.pending_bios, 0); - pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name); - if (IS_ERR(pd->cdrw.thread)) { - dev_err(ddev, "can't start kernel thread\n"); - goto out_mem; - } - - proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd); - dev_notice(ddev, "writer mapped to %pg\n", file_bdev(bdev_file)); - return 0; - -out_mem: - fput(bdev_file); - /* This is safe: open() is still holding a reference. */ - module_put(THIS_MODULE); - return -ENOMEM; -} - -static int pkt_ioctl(struct block_device *bdev, blk_mode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct pktcdvd_device *pd = bdev->bd_disk->private_data; - struct device *ddev = disk_to_dev(pd->disk); - int ret; - - dev_dbg(ddev, "cmd %x, dev %d:%d\n", cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - - mutex_lock(&pktcdvd_mutex); - switch (cmd) { - case CDROMEJECT: - /* - * The door gets locked when the device is opened, so we - * have to unlock it or else the eject command fails. 
- */ - if (pd->refcnt == 1) - pkt_lock_door(pd, 0); - fallthrough; - /* - * forward selected CDROM ioctls to CD-ROM, for UDF - */ - case CDROMMULTISESSION: - case CDROMREADTOCENTRY: - case CDROM_LAST_WRITTEN: - case CDROM_SEND_PACKET: - case SCSI_IOCTL_SEND_COMMAND: - if (!bdev->bd_disk->fops->ioctl) - ret = -ENOTTY; - else - ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); - break; - default: - dev_dbg(ddev, "Unknown ioctl (%x)\n", cmd); - ret = -ENOTTY; - } - mutex_unlock(&pktcdvd_mutex); - - return ret; -} - -static unsigned int pkt_check_events(struct gendisk *disk, - unsigned int clearing) -{ - struct pktcdvd_device *pd = disk->private_data; - struct gendisk *attached_disk; - - if (!pd) - return 0; - if (!pd->bdev_file) - return 0; - attached_disk = file_bdev(pd->bdev_file)->bd_disk; - if (!attached_disk || !attached_disk->fops->check_events) - return 0; - return attached_disk->fops->check_events(attached_disk, clearing); -} - -static char *pkt_devnode(struct gendisk *disk, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name); -} - -static const struct block_device_operations pktcdvd_ops = { - .owner = THIS_MODULE, - .submit_bio = pkt_submit_bio, - .open = pkt_open, - .release = pkt_release, - .ioctl = pkt_ioctl, - .compat_ioctl = blkdev_compat_ptr_ioctl, - .check_events = pkt_check_events, - .devnode = pkt_devnode, -}; - -/* - * Set up mapping from pktcdvd device to CD-ROM device. - */ -static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) -{ - struct queue_limits lim = { - .max_hw_sectors = PACKET_MAX_SECTORS, - .logical_block_size = CD_FRAMESIZE, - .features = BLK_FEAT_ROTATIONAL, - }; - int idx; - int ret = -ENOMEM; - struct pktcdvd_device *pd; - struct gendisk *disk; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - for (idx = 0; idx < MAX_WRITERS; idx++) - if (!pkt_devs[idx]) - break; - if (idx == MAX_WRITERS) { - pr_err("max %d writers supported\n", MAX_WRITERS); - ret = -EBUSY; - goto out_mutex; - } - - pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL); - if (!pd) - goto out_mutex; - - ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE, - sizeof(struct pkt_rb_node)); - if (ret) - goto out_mem; - - INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); - INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); - spin_lock_init(&pd->cdrw.active_list_lock); - - spin_lock_init(&pd->lock); - spin_lock_init(&pd->iosched.lock); - bio_list_init(&pd->iosched.read_queue); - bio_list_init(&pd->iosched.write_queue); - init_waitqueue_head(&pd->wqueue); - pd->bio_queue = RB_ROOT; - - pd->write_congestion_on = write_congestion_on; - pd->write_congestion_off = write_congestion_off; - - disk = blk_alloc_disk(&lim, NUMA_NO_NODE); - if (IS_ERR(disk)) { - ret = PTR_ERR(disk); - goto out_mem; - } - pd->disk = disk; - disk->major = pktdev_major; - disk->first_minor = idx; - disk->minors = 1; - disk->fops = &pktcdvd_ops; - disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; - snprintf(disk->disk_name, sizeof(disk->disk_name), DRIVER_NAME"%d", idx); - disk->private_data = pd; - - pd->pkt_dev = MKDEV(pktdev_major, idx); - ret = pkt_new_dev(pd, dev); - if (ret) - goto out_mem2; - - /* inherit events of the host device */ - disk->events = file_bdev(pd->bdev_file)->bd_disk->events; - - ret = add_disk(disk); - if (ret) - goto out_mem2; - - pkt_sysfs_dev_new(pd); - pkt_debugfs_dev_new(pd); - - pkt_devs[idx] = pd; - if (pkt_dev) - *pkt_dev = pd->pkt_dev; - - mutex_unlock(&ctl_mutex); - return 0; - -out_mem2: - put_disk(disk); -out_mem: - mempool_exit(&pd->rb_pool); - 
kfree(pd); -out_mutex: - mutex_unlock(&ctl_mutex); - pr_err("setup of pktcdvd device failed\n"); - return ret; -} - -/* - * Tear down mapping from pktcdvd device to CD-ROM device. - */ -static int pkt_remove_dev(dev_t pkt_dev) -{ - struct pktcdvd_device *pd; - struct device *ddev; - int idx; - int ret = 0; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - for (idx = 0; idx < MAX_WRITERS; idx++) { - pd = pkt_devs[idx]; - if (pd && (pd->pkt_dev == pkt_dev)) - break; - } - if (idx == MAX_WRITERS) { - pr_debug("dev not setup\n"); - ret = -ENXIO; - goto out; - } - - if (pd->refcnt > 0) { - ret = -EBUSY; - goto out; - } - - ddev = disk_to_dev(pd->disk); - - if (!IS_ERR(pd->cdrw.thread)) - kthread_stop(pd->cdrw.thread); - - pkt_devs[idx] = NULL; - - pkt_debugfs_dev_remove(pd); - pkt_sysfs_dev_remove(pd); - - fput(pd->bdev_file); - - remove_proc_entry(pd->disk->disk_name, pkt_proc); - dev_notice(ddev, "writer unmapped\n"); - - del_gendisk(pd->disk); - put_disk(pd->disk); - - mempool_exit(&pd->rb_pool); - kfree(pd); - - /* This is safe: open() is still holding a reference. */ - module_put(THIS_MODULE); - -out: - mutex_unlock(&ctl_mutex); - return ret; -} - -static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) -{ - struct pktcdvd_device *pd; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); - if (pd) { - ctrl_cmd->dev = new_encode_dev(file_bdev(pd->bdev_file)->bd_dev); - ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); - } else { - ctrl_cmd->dev = 0; - ctrl_cmd->pkt_dev = 0; - } - ctrl_cmd->num_devices = MAX_WRITERS; - - mutex_unlock(&ctl_mutex); -} - -static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - struct pkt_ctrl_command ctrl_cmd; - int ret = 0; - dev_t pkt_dev = 0; - - if (cmd != PACKET_CTRL_CMD) - return -ENOTTY; - - if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command))) - return -EFAULT; - - switch (ctrl_cmd.command) { - case PKT_CTRL_CMD_SETUP: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev); - ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev); - break; - case PKT_CTRL_CMD_TEARDOWN: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev)); - break; - case PKT_CTRL_CMD_STATUS: - pkt_get_status(&ctrl_cmd); - break; - default: - return -ENOTTY; - } - - if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command))) - return -EFAULT; - return ret; -} - -#ifdef CONFIG_COMPAT -static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - -static const struct file_operations pkt_ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = pkt_ctl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = pkt_ctl_compat_ioctl, -#endif - .owner = THIS_MODULE, -}; - -static struct miscdevice pkt_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = DRIVER_NAME, - .nodename = "pktcdvd/control", - .fops = &pkt_ctl_fops -}; - -static int __init pkt_init(void) -{ - int ret; - - mutex_init(&ctl_mutex); - - ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE, - sizeof(struct packet_stacked_data)); - if (ret) - return ret; - ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0); - if (ret) { - mempool_exit(&psd_pool); - return ret; - } - - ret = register_blkdev(pktdev_major, DRIVER_NAME); - if (ret < 0) { - pr_err("unable to register 
block device\n"); - goto out2; - } - if (!pktdev_major) - pktdev_major = ret; - - ret = pkt_sysfs_init(); - if (ret) - goto out; - - pkt_debugfs_init(); - - ret = misc_register(&pkt_misc); - if (ret) { - pr_err("unable to register misc device\n"); - goto out_misc; - } - - pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL); - - return 0; - -out_misc: - pkt_debugfs_cleanup(); - pkt_sysfs_cleanup(); -out: - unregister_blkdev(pktdev_major, DRIVER_NAME); -out2: - mempool_exit(&psd_pool); - bioset_exit(&pkt_bio_set); - return ret; -} - -static void __exit pkt_exit(void) -{ - remove_proc_entry("driver/"DRIVER_NAME, NULL); - misc_deregister(&pkt_misc); - - pkt_debugfs_cleanup(); - pkt_sysfs_cleanup(); - - unregister_blkdev(pktdev_major, DRIVER_NAME); - mempool_exit(&psd_pool); - bioset_exit(&pkt_bio_set); -} - -MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); -MODULE_AUTHOR("Jens Axboe <axboe@suse.de>"); -MODULE_LICENSE("GPL"); - -module_init(pkt_init); -module_exit(pkt_exit); diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 2ee6e9bd4e28..2df8941a6b14 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -147,12 +147,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); - if (bio_add_page(bio, virt_to_page(data), datalen, - offset_in_page(data)) != datalen) { - rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n"); - err = -EINVAL; - goto bio_put; - } + bio_add_virt_nofail(bio, data, datalen); bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); if (bio_has_data(bio) && diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index b5727dea15bd..7af21fe67671 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -957,8 +957,10 @@ static bool vdc_port_mpgroup_check(struct vio_dev *vdev) dev = device_find_child(vdev->dev.parent, &port_data, vdc_device_probed); - if (dev) + if (dev) { + put_device(dev); return true; + } return false; } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index ee6cade70222..01f7aef3fcfb 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -555,7 +555,7 @@ static void act(struct floppy_state *fs) static void scan_timeout(struct timer_list *t) { - struct floppy_state *fs = from_timer(fs, t, timeout); + struct floppy_state *fs = timer_container_of(fs, t, timeout); struct swim3 __iomem *sw = fs->swim3; unsigned long flags; @@ -579,7 +579,7 @@ static void scan_timeout(struct timer_list *t) static void seek_timeout(struct timer_list *t) { - struct floppy_state *fs = from_timer(fs, t, timeout); + struct floppy_state *fs = timer_container_of(fs, t, timeout); struct swim3 __iomem *sw = fs->swim3; unsigned long flags; @@ -598,7 +598,7 @@ static void seek_timeout(struct timer_list *t) static void settle_timeout(struct timer_list *t) { - struct floppy_state *fs = from_timer(fs, t, timeout); + struct floppy_state *fs = timer_container_of(fs, t, timeout); struct swim3 __iomem *sw = fs->swim3; unsigned long flags; @@ -627,7 +627,7 @@ static void settle_timeout(struct timer_list *t) static void xfer_timeout(struct timer_list *t) { - struct floppy_state *fs = from_timer(fs, t, timeout); + struct floppy_state *fs = timer_container_of(fs, t, timeout); struct swim3 __iomem *sw = fs->swim3; struct dbdma_regs __iomem *dr = fs->dma; unsigned long flags; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index dc104c025cd5..99abd67b708b 100644 --- 
a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -48,8 +48,12 @@ #define UBLK_MINORS (1U << MINORBITS) +#define UBLK_INVALID_BUF_IDX ((u16)-1) + /* private ioctl command mirror */ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) +#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) +#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -64,7 +68,12 @@ | UBLK_F_CMD_IOCTL_ENCODE \ | UBLK_F_USER_COPY \ | UBLK_F_ZONED \ - | UBLK_F_USER_RECOVERY_FAIL_IO) + | UBLK_F_USER_RECOVERY_FAIL_IO \ + | UBLK_F_UPDATE_SIZE \ + | UBLK_F_AUTO_BUF_REG \ + | UBLK_F_QUIESCE \ + | UBLK_F_PER_IO_DAEMON \ + | UBLK_F_BUF_REG_OFF_DAEMON) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -76,10 +85,6 @@ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) -struct ublk_rq_data { - struct kref ref; -}; - struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -99,6 +104,7 @@ struct ublk_uring_cmd_pdu { * setup in ublk uring_cmd handler */ struct ublk_queue *ubq; + u16 tag; }; @@ -131,28 +137,68 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 +/* + * request buffer is registered automatically, so we have to unregister it + * before completing this request. + * + * io_uring will unregister buffer automatically for us during exiting. + */ +#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10 + /* atomic RW with ubq->cancel_lock */ #define UBLK_IO_FLAG_CANCELED 0x80000000 +/* + * Initialize refcount to a large number to include any registered buffers. + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for + * any buffers registered on the io daemon task. + */ +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) + struct ublk_io { /* userspace buffer address from io cmd */ - __u64 addr; + union { + __u64 addr; + struct ublk_auto_buf_reg buf; + }; unsigned int flags; int res; - struct io_uring_cmd *cmd; -}; + union { + /* valid if UBLK_IO_FLAG_ACTIVE is set */ + struct io_uring_cmd *cmd; + /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */ + struct request *req; + }; + + struct task_struct *task; + + /* + * The number of uses of this I/O by the ublk server + * if user copy or zero copy are enabled: + * - UBLK_REFCOUNT_INIT from dispatch to the server + * until UBLK_IO_COMMIT_AND_FETCH_REQ + * - 1 for each inflight ublk_ch_{read,write}_iter() call + * - 1 for each io_uring registered buffer not registered on task + * The I/O can only be completed once all references are dropped. + * User copy and buffer registration operations are only permitted + * if the reference count is nonzero. 
+ */ + refcount_t ref; + /* Count of buffers registered on task and not yet unregistered */ + unsigned task_registered_buffers; + + void *buf_ctx_handle; +} ____cacheline_aligned_in_smp; struct ublk_queue { int q_id; int q_depth; unsigned long flags; - struct task_struct *ubq_daemon; struct ublksrv_io_desc *io_cmd_buf; bool force_abort; - bool timeout; bool canceling; bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ unsigned short nr_io_ready; /* how many ios setup */ @@ -189,7 +235,10 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; - unsigned int nr_privileged_daemon; + bool unprivileged_daemons; + struct mutex cancel_mutex; + bool canceling; + pid_t ublksrv_tgid; }; /* header of ublk_params */ @@ -198,13 +247,20 @@ struct ublk_params_header { __u32 types; }; +static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - const struct ublk_queue *ubq, int tag, size_t offset); + const struct ublk_queue *ubq, struct ublk_io *io, + size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); -static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, - int tag); + +static inline struct ublksrv_io_desc * +ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) +{ + return &ubq->io_cmd_buf[tag]; +} + static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_ZONED; @@ -356,8 +412,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, if (ret) goto free_req; - ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, - GFP_KERNEL); + ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL); if (ret) goto erase_desc; @@ -477,7 +532,6 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif static inline void __ublk_complete_rq(struct request *req); -static void ublk_complete_rq(struct kref *ref); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -609,6 +663,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; } +static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_AUTO_BUF_REG; +} + static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_USER_COPY; @@ -616,7 +675,8 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) static inline bool ublk_need_map_io(const struct ublk_queue *ubq) { - return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq); + return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && + !ublk_support_auto_buf_reg(ubq); } static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) @@ -627,42 +687,39 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) * * for zero copy, request buffer need to be registered to io_uring * buffer table, so reference is needed + * + * For auto buffer register, ublk server still may issue + * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up, + * so reference is required too. 
*/ - return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq); + return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) || + ublk_support_auto_buf_reg(ubq); } static inline void ublk_init_req_ref(const struct ublk_queue *ubq, - struct request *req) + struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - kref_init(&data->ref); - } + if (ublk_need_req_ref(ubq)) + refcount_set(&io->ref, UBLK_REFCOUNT_INIT); } -static inline bool ublk_get_req_ref(const struct ublk_queue *ubq, - struct request *req) +static inline bool ublk_get_req_ref(struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - return kref_get_unless_zero(&data->ref); - } + return refcount_inc_not_zero(&io->ref); +} - return true; +static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) +{ + if (refcount_dec_and_test(&io->ref)) + __ublk_complete_rq(req); } -static inline void ublk_put_req_ref(const struct ublk_queue *ubq, - struct request *req) +static inline bool ublk_sub_req_ref(struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers; - kref_put(&data->ref, ublk_complete_rq); - } else { - __ublk_complete_rq(req); - } + io->task_registered_buffers = 0; + return refcount_sub_and_test(sub_refs, &io->ref); } static inline bool ublk_need_get_data(const struct ublk_queue *ubq) @@ -695,12 +752,6 @@ static inline bool ublk_rq_has_data(const struct request *rq) return bio_has_data(rq->bio); } -static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, - int tag) -{ - return &ubq->io_cmd_buf[tag]; -} - static inline struct ublksrv_io_desc * ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) { @@ -945,7 +996,7 @@ static inline bool ublk_need_unmap_req(const struct request *req) } static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, - struct ublk_io *io) + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); @@ -969,7 +1020,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, static int ublk_unmap_io(const struct ublk_queue *ubq, const struct request *req, - struct ublk_io *io) + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); @@ -1064,11 +1115,6 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); } -static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) -{ - return !ubq->ubq_daemon || ubq->ubq_daemon->flags & PF_EXITING; -} - /* todo: handle partial completion */ static inline void __ublk_complete_rq(struct request *req) { @@ -1109,7 +1155,7 @@ static inline void __ublk_complete_rq(struct request *req) if (blk_update_request(req, BLK_STS_OK, io->res)) blk_mq_requeue_request(req, true); - else + else if (likely(!blk_should_fake_timeout(req->q))) __blk_mq_end_request(req, BLK_STS_OK); return; @@ -1117,18 +1163,12 @@ exit: blk_mq_end_request(req, res); } -static void ublk_complete_rq(struct kref *ref) +static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io, + struct request *req) { - struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, - ref); - struct request *req = blk_mq_rq_from_pdu(data); - - __ublk_complete_rq(req); -} + /* read cmd first because req will overwrite it */ + struct io_uring_cmd *cmd = io->cmd; -static void 
ubq_complete_io_cmd(struct ublk_io *io, int res, - unsigned issue_flags) -{ /* mark this cmd owned by ublksrv */ io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; @@ -1138,8 +1178,17 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res, */ io->flags &= ~UBLK_IO_FLAG_ACTIVE; + io->req = req; + return cmd; +} + +static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, + int res, unsigned issue_flags) +{ + struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); + /* tell ublksrv one io request is coming */ - io_uring_cmd_done(io->cmd, res, 0, issue_flags); + io_uring_cmd_done(cmd, res, 0, issue_flags); } #define UBLK_REQUEUE_DELAY_MS 3 @@ -1154,28 +1203,97 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, blk_mq_end_request(rq, BLK_STS_IOERR); } +static void +ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io) +{ + unsigned tag = io - ubq->ios; + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag); + + iod->op_flags |= UBLK_IO_F_NEED_REG_BUF; +} + +static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, unsigned int issue_flags) +{ + int ret; + + ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release, + io->buf.index, issue_flags); + if (ret) { + if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) { + ublk_auto_buf_reg_fallback(ubq, io); + return true; + } + blk_mq_end_request(req, BLK_STS_IOERR); + return false; + } + + io->task_registered_buffers = 1; + io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd); + io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; + return true; +} + +static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + unsigned int issue_flags) +{ + ublk_init_req_ref(ubq, io); + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) + return ublk_auto_buf_reg(ubq, req, io, issue_flags); + + return true; +} + +static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io) +{ + unsigned mapped_bytes = ublk_map_io(ubq, req, io); + + /* partially mapped, update io descriptor */ + if (unlikely(mapped_bytes != blk_rq_bytes(req))) { + /* + * Nothing mapped, retry until we succeed. + * + * We may never succeed in mapping any bytes here because + * of OOM. TODO: reserve one buffer with single page pinned + * for providing forward progress guarantee. + */ + if (unlikely(!mapped_bytes)) { + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, + UBLK_REQUEUE_DELAY_MS); + return false; + } + + ublk_get_iod(ubq, req->tag)->nr_sectors = + mapped_bytes >> 9; + } + + return true; +} + static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req, unsigned int issue_flags) { int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; - unsigned int mapped_bytes; - pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", - __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, + pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n", + __func__, ubq->q_id, req->tag, io->flags, ublk_get_iod(ubq, req->tag)->addr); /* * Task is exiting if either: * - * (1) current != ubq_daemon. + * (1) current != io->task. * io_uring_cmd_complete_in_task() tries to run task_work - * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING. + * in a workqueue if cmd's task is PF_EXITING. * * (2) current->flags & PF_EXITING. 
*/ - if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { + if (unlikely(current != io->task || current->flags & PF_EXITING)) { __ublk_abort_rq(ubq, req); return; } @@ -1183,54 +1301,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { /* * We have not handled UBLK_IO_NEED_GET_DATA command yet, - * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv + * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv * and notify it. */ - if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) { - io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; - pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n", - __func__, io->cmd->cmd_op, ubq->q_id, - req->tag, io->flags); - ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags); - return; - } - /* - * We have handled UBLK_IO_NEED_GET_DATA command, - * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just - * do the copy work. - */ - io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; - /* update iod->addr because ublksrv may have passed a new io buffer */ - ublk_get_iod(ubq, req->tag)->addr = io->addr; - pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n", - __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, - ublk_get_iod(ubq, req->tag)->addr); + io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; + pr_devel("%s: need get data. qid %d tag %d io_flags %x\n", + __func__, ubq->q_id, req->tag, io->flags); + ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA, + issue_flags); + return; } - mapped_bytes = ublk_map_io(ubq, req, io); - - /* partially mapped, update io descriptor */ - if (unlikely(mapped_bytes != blk_rq_bytes(req))) { - /* - * Nothing mapped, retry until we succeed. - * - * We may never succeed in mapping any bytes here because - * of OOM. TODO: reserve one buffer with single page pinned - * for providing forward progress guarantee. 
- */ - if (unlikely(!mapped_bytes)) { - blk_mq_requeue_request(req, false); - blk_mq_delay_kick_requeue_list(req->q, - UBLK_REQUEUE_DELAY_MS); - return; - } - - ublk_get_iod(ubq, req->tag)->nr_sectors = - mapped_bytes >> 9; - } + if (!ublk_start_io(ubq, req, io)) + return; - ublk_init_req_ref(ubq, req); - ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); + if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags)) + ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); } static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, @@ -1256,24 +1342,22 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct request *rq = pdu->req_list; - struct ublk_queue *ubq = pdu->ubq; struct request *next; do { next = rq->rq_next; rq->rq_next = NULL; - ublk_dispatch_req(ubq, rq, issue_flags); + ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags); rq = next; } while (rq); } -static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) +static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l) { - struct request *rq = rq_list_peek(l); - struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct io_uring_cmd *cmd = io->cmd; struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - pdu->req_list = rq; + pdu->req_list = rq_list_peek(l); rq_list_init(l); io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); } @@ -1281,17 +1365,23 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) static enum blk_eh_timer_return ublk_timeout(struct request *rq) { struct ublk_queue *ubq = rq->mq_hctx->driver_data; + pid_t tgid = ubq->dev->ublksrv_tgid; + struct task_struct *p; + struct pid *pid; - if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { - if (!ubq->timeout) { - send_sig(SIGKILL, ubq->ubq_daemon, 0); - ubq->timeout = true; - } + if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV)) + return BLK_EH_RESET_TIMER; - return BLK_EH_DONE; - } + if (unlikely(!tgid)) + return BLK_EH_RESET_TIMER; - return BLK_EH_RESET_TIMER; + rcu_read_lock(); + pid = find_vpid(tgid); + p = pid_task(pid, PIDTYPE_PID); + if (p) + send_sig(SIGKILL, p, 0); + rcu_read_unlock(); + return BLK_EH_DONE; } static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, @@ -1299,7 +1389,7 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, { blk_status_t res; - if (unlikely(ubq->fail_io)) + if (unlikely(READ_ONCE(ubq->fail_io))) return BLK_STS_TARGET; /* With recovery feature enabled, force_abort is set in @@ -1311,7 +1401,8 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, * Note: force_abort is guaranteed to be seen because it is set * before request queue is unqiuesced. 
*/ - if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) + if (ublk_nosrv_should_queue_io(ubq) && + unlikely(READ_ONCE(ubq->force_abort))) return BLK_STS_IOERR; if (check_cancel && unlikely(ubq->canceling)) @@ -1351,28 +1442,39 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, + const struct ublk_io *io2) +{ + return (io_uring_cmd_ctx_handle(io->cmd) == + io_uring_cmd_ctx_handle(io2->cmd)) && + (io->task == io2->task); +} + static void ublk_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = { }; struct rq_list submit_list = { }; - struct ublk_queue *ubq = NULL; + struct ublk_io *io = NULL; struct request *req; while ((req = rq_list_pop(rqlist))) { struct ublk_queue *this_q = req->mq_hctx->driver_data; + struct ublk_io *this_io = &this_q->ios[req->tag]; - if (ubq && ubq != this_q && !rq_list_empty(&submit_list)) - ublk_queue_cmd_list(ubq, &submit_list); - ubq = this_q; - - if (ublk_prep_req(ubq, req, true) == BLK_STS_OK) - rq_list_add_tail(&submit_list, req); - else + if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { rq_list_add_tail(&requeue_list, req); + continue; + } + + if (io && !ublk_belong_to_same_batch(io, this_io) && + !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(io, &submit_list); + io = this_io; + rq_list_add_tail(&submit_list, req); } - if (ubq && !rq_list_empty(&submit_list)) - ublk_queue_cmd_list(ubq, &submit_list); + if (!rq_list_empty(&submit_list)) + ublk_queue_cmd_list(io, &submit_list); *rqlist = requeue_list; } @@ -1400,17 +1502,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) /* All old ioucmds have to be completed */ ubq->nr_io_ready = 0; - /* - * old daemon is PF_EXITING, put it now - * - * It could be NULL in case of closing one quisced device. - */ - if (ubq->ubq_daemon) - put_task_struct(ubq->ubq_daemon); - /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ - ubq->ubq_daemon = NULL; - ubq->timeout = false; - for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1421,6 +1512,20 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) io->flags &= UBLK_IO_FLAG_CANCELED; io->cmd = NULL; io->addr = 0; + + /* + * old task is PF_EXITING, put it now + * + * It could be NULL in case of closing one quiesced + * device. + */ + if (io->task) { + put_task_struct(io->task); + io->task = NULL; + } + + WARN_ON_ONCE(refcount_read(&io->ref)); + WARN_ON_ONCE(io->task_registered_buffers); } } @@ -1432,6 +1537,7 @@ static int ublk_ch_open(struct inode *inode, struct file *filp) if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) return -EBUSY; filp->private_data = ub; + ub->ublksrv_tgid = current->tgid; return 0; } @@ -1442,10 +1548,11 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) for (i = 0; i < ub->dev_info.nr_hw_queues; i++) ublk_queue_reinit(ub, ublk_get_queue(ub, i)); - /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ + /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; ub->nr_queues_ready = 0; - ub->nr_privileged_daemon = 0; + ub->unprivileged_daemons = false; + ub->ublksrv_tgid = -1; } static struct gendisk *ublk_get_disk(struct ublk_device *ub) @@ -1467,6 +1574,27 @@ static void ublk_put_disk(struct gendisk *disk) put_device(disk_to_dev(disk)); } +/* + * Use this function to ensure that ->canceling is consistently set for + * the device and all queues. 
Do not set these flags directly. + * + * Caller must ensure that: + * - cancel_mutex is held. This ensures that there is no concurrent + * access to ub->canceling and no concurrent writes to ubq->canceling. + * - there are no concurrent reads of ubq->canceling from the queue_rq + * path. This can be done by quiescing the queue, or through other + * means. + */ +static void ublk_set_canceling(struct ublk_device *ub, bool canceling) + __must_hold(&ub->cancel_mutex) +{ + int i; + + ub->canceling = canceling; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->canceling = canceling; +} + static int ublk_ch_release(struct inode *inode, struct file *filp) { struct ublk_device *ub = filp->private_data; @@ -1495,12 +1623,11 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * All requests may be inflight, so ->canceling may not be set, set * it now. */ - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); - - ubq->canceling = true; - ublk_abort_queue(ub, ubq); - } + mutex_lock(&ub->cancel_mutex); + ublk_set_canceling(ub, true); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_abort_queue(ub, ublk_get_queue(ub, i)); + mutex_unlock(&ub->cancel_mutex); blk_mq_kick_requeue_list(disk->queue); /* @@ -1518,7 +1645,6 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * Transition the device to the nosrv state. What exactly this * means depends on the recovery flags */ - blk_mq_quiesce_queue(disk->queue); if (ublk_nosrv_should_stop_dev(ub)) { /* * Allow any pending/future I/O to pass through quickly @@ -1526,8 +1652,7 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * waits for all pending I/O to complete */ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_get_queue(ub, i)->force_abort = true; - blk_mq_unquiesce_queue(disk->queue); + WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true); ublk_stop_dev_unlocked(ub); } else { @@ -1537,9 +1662,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) } else { ub->dev_info.state = UBLK_S_DEV_FAIL_IO; for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_get_queue(ub, i)->fail_io = true; + WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true); } - blk_mq_unquiesce_queue(disk->queue); } unlock: mutex_unlock(&ub->mutex); @@ -1590,30 +1714,6 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } -static void ublk_commit_completion(struct ublk_device *ub, - const struct ublksrv_io_cmd *ub_cmd) -{ - u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; - struct ublk_queue *ubq = ublk_get_queue(ub, qid); - struct ublk_io *io = &ubq->ios[tag]; - struct request *req; - - /* now this cmd slot is owned by nbd driver */ - io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; - io->res = ub_cmd->result; - - /* find the io request and complete */ - req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); - if (WARN_ON_ONCE(unlikely(!req))) - return; - - if (req_op(req) == REQ_OP_ZONE_APPEND) - req->__sector = ub_cmd->zone_append_lba; - - if (likely(!blk_should_fake_timeout(req->q))) - ublk_put_req_ref(ubq, req); -} - static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, struct request *req) { @@ -1642,37 +1742,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; - if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) { - struct request *rq; - - /* - * 
Either we fail the request or ublk_rq_task_work_cb - * will do it - */ - rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); - if (rq && blk_mq_request_started(rq)) - __ublk_fail_req(ubq, io, rq); - } + if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) + __ublk_fail_req(ubq, io, io->req); } } -/* Must be called when queue is frozen */ -static void ublk_mark_queue_canceling(struct ublk_queue *ubq) +static void ublk_start_cancel(struct ublk_device *ub) { - spin_lock(&ubq->cancel_lock); - if (!ubq->canceling) - ubq->canceling = true; - spin_unlock(&ubq->cancel_lock); -} - -static void ublk_start_cancel(struct ublk_queue *ubq) -{ - struct ublk_device *ub = ubq->dev; struct gendisk *disk = ublk_get_disk(ub); /* Our disk has been dead */ if (!disk) return; + + mutex_lock(&ub->cancel_mutex); + if (ub->canceling) + goto out; /* * Now we are serialized with ublk_queue_rq() * @@ -1681,8 +1766,10 @@ static void ublk_start_cancel(struct ublk_queue *ubq) * touch completed uring_cmd */ blk_mq_quiesce_queue(disk->queue); - ublk_mark_queue_canceling(ubq); + ublk_set_canceling(ub, true); blk_mq_unquiesce_queue(disk->queue); +out: + mutex_unlock(&ub->cancel_mutex); ublk_put_disk(disk); } @@ -1742,6 +1829,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; struct task_struct *task; + struct ublk_io *io; if (WARN_ON_ONCE(!ubq)) return; @@ -1750,13 +1838,13 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, return; task = io_uring_cmd_get_task(cmd); - if (WARN_ON_ONCE(task && task != ubq->ubq_daemon)) + io = &ubq->ios[pdu->tag]; + if (WARN_ON_ONCE(task && task != io->task)) return; - if (!ubq->canceling) - ublk_start_cancel(ubq); + ublk_start_cancel(ubq->dev); - WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd); + WARN_ON_ONCE(io->cmd != cmd); ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } @@ -1878,9 +1966,11 @@ static void ublk_reset_io_flags(struct ublk_device *ub) for (j = 0; j < ubq->q_depth; j++) ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; spin_unlock(&ubq->cancel_lock); - ubq->canceling = false; ubq->fail_io = false; } + mutex_lock(&ub->cancel_mutex); + ublk_set_canceling(ub, false); + mutex_unlock(&ub->cancel_mutex); } /* device can only be started after all IOs are ready */ @@ -1888,14 +1978,10 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) __must_hold(&ub->mutex) { ubq->nr_io_ready++; - if (ublk_queue_ready(ubq)) { - ubq->ubq_daemon = current; - get_task_struct(ubq->ubq_daemon); + if (ublk_queue_ready(ubq)) ub->nr_queues_ready++; - - if (capable(CAP_SYS_ADMIN)) - ub->nr_privileged_daemon++; - } + if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) + ub->unprivileged_daemons = true; if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) { /* now we are ready for handling ublk io request */ @@ -1917,12 +2003,66 @@ static inline int ublk_check_cmd_op(u32 cmd_op) return 0; } -static inline void ublk_fill_io_cmd(struct ublk_io *io, - struct io_uring_cmd *cmd, unsigned long buf_addr) +static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd) +{ + io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); + + if (io->buf.reserved0 || io->buf.reserved1) + return -EINVAL; + + if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) + return -EINVAL; + return 0; +} + +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) { + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) { + io->flags &= 
~UBLK_IO_FLAG_AUTO_BUF_REG; + + /* + * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` + * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same + * `io_ring_ctx`. + * + * If this uring_cmd's io_ring_ctx isn't same with the + * one for registering the buffer, it is ublk server's + * responsibility for unregistering the buffer, otherwise + * this ublk request gets stuck. + */ + if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) + *buf_idx = io->buf.index; + } + + return ublk_set_auto_buf_reg(io, cmd); +} + +/* Once we return, `io->req` can't be used any more */ +static inline struct request * +ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd) +{ + struct request *req = io->req; + io->cmd = cmd; io->flags |= UBLK_IO_FLAG_ACTIVE; + /* now this cmd slot is owned by ublk driver */ + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + + return req; +} + +static inline int +ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned long buf_addr, + u16 *buf_idx) +{ + if (ublk_support_auto_buf_reg(ubq)) + return ublk_handle_auto_buf_reg(io, cmd, buf_idx); + io->addr = buf_addr; + return 0; } static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, @@ -1944,54 +2084,99 @@ static void ublk_io_release(void *priv) { struct request *rq = priv; struct ublk_queue *ubq = rq->mq_hctx->driver_data; + struct ublk_io *io = &ubq->ios[rq->tag]; - ublk_put_req_ref(ubq, rq); + /* + * task_registered_buffers may be 0 if buffers were registered off task + * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ. + */ + if (current == io->task && io->task_registered_buffers) + io->task_registered_buffers--; + else + ublk_put_req_ref(io, rq); } static int ublk_register_io_buf(struct io_uring_cmd *cmd, - const struct ublk_queue *ubq, unsigned int tag, + const struct ublk_queue *ubq, + struct ublk_io *io, unsigned int index, unsigned int issue_flags) { struct ublk_device *ub = cmd->file->private_data; - const struct ublk_io *io = &ubq->ios[tag]; struct request *req; int ret; if (!ublk_support_zero_copy(ubq)) return -EINVAL; - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) - return -EINVAL; - - req = __ublk_check_and_get_req(ub, ubq, tag, 0); + req = __ublk_check_and_get_req(ub, ubq, io, 0); if (!req) return -EINVAL; ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, issue_flags); if (ret) { - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return ret; } return 0; } -static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, - const struct ublk_queue *ubq, unsigned int tag, - unsigned int index, unsigned int issue_flags) +static int +ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, + const struct ublk_queue *ubq, struct ublk_io *io, + unsigned index, unsigned issue_flags) { - const struct ublk_io *io = &ubq->ios[tag]; + unsigned new_registered_buffers; + struct request *req = io->req; + int ret; - if (!ublk_support_zero_copy(ubq)) + /* + * Ensure there are still references for ublk_sub_req_ref() to release. + * If not, fall back on the thread-safe buffer registration. 
+ */ + new_registered_buffers = io->task_registered_buffers + 1; + if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) + return ublk_register_io_buf(cmd, ubq, io, index, issue_flags); + + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req)) return -EINVAL; - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) + return ret; + + io->task_registered_buffers = new_registered_buffers; + return 0; +} + +static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, + const struct ublk_device *ub, + unsigned int index, unsigned int issue_flags) +{ + if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)) return -EINVAL; return io_buffer_unregister_bvec(cmd, index, issue_flags); } +static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr) +{ + if (ublk_need_map_io(ubq)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!buf_addr && !ublk_need_get_data(ubq)) + return -EINVAL; + } else if (buf_addr) { + /* User copy requires addr to be unset */ + return -EINVAL; + } + return 0; +} + static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, struct ublk_io *io, __u64 buf_addr) { @@ -2018,59 +2203,135 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); - if (ublk_need_map_io(ubq)) { - /* - * FETCH_RQ has to provide IO buffer if NEED GET - * DATA is not enabled - */ - if (!buf_addr && !ublk_need_get_data(ubq)) - goto out; - } else if (buf_addr) { - /* User copy requires addr to be unset */ - ret = -EINVAL; + ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL); + if (ret) goto out; - } - ublk_fill_io_cmd(io, cmd, buf_addr); + WRITE_ONCE(io->task, get_task_struct(current)); ublk_mark_io_ready(ub, ubq); out: mutex_unlock(&ub->mutex); return ret; } +static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq, + struct ublk_io *io, __u64 buf_addr) +{ + struct request *req = io->req; + + if (ublk_need_map_io(ubq)) { + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if + * NEED GET DATA is not enabled or it is Read IO. + */ + if (!buf_addr && (!ublk_need_get_data(ubq) || + req_op(req) == REQ_OP_READ)) + return -EINVAL; + } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { + /* + * User copy requires addr to be unset when command is + * not zone append + */ + return -EINVAL; + } + + return 0; +} + +static bool ublk_need_complete_req(const struct ublk_queue *ubq, + struct ublk_io *io) +{ + if (ublk_need_req_ref(ubq)) + return ublk_sub_req_ref(io); + return true; +} + +static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) +{ + /* + * We have handled UBLK_IO_NEED_GET_DATA command, + * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just + * do the copy work. 
+ */ + io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; + /* update iod->addr because ublksrv may have passed a new io buffer */ + ublk_get_iod(ubq, req->tag)->addr = io->addr; + pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n", + __func__, ubq->q_id, req->tag, io->flags, + ublk_get_iod(ubq, req->tag)->addr); + + return ublk_start_io(ubq, req, io); +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) { + u16 buf_idx = UBLK_INVALID_BUF_IDX; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; struct ublk_io *io; u32 cmd_op = cmd->cmd_op; unsigned tag = ub_cmd->tag; - int ret = -EINVAL; struct request *req; + int ret; + bool compl; pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", __func__, cmd->cmd_op, ub_cmd->q_id, tag, ub_cmd->result); - if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) + ret = ublk_check_cmd_op(cmd_op); + if (ret) goto out; - ubq = ublk_get_queue(ub, ub_cmd->q_id); - if (!ubq || ub_cmd->q_id != ubq->q_id) - goto out; + /* + * io_buffer_unregister_bvec() doesn't access the ubq or io, + * so no need to validate the q_id, tag, or task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, + issue_flags); - if (ubq->ubq_daemon && ubq->ubq_daemon != current) + ret = -EINVAL; + if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) goto out; + ubq = ublk_get_queue(ub, ub_cmd->q_id); + if (tag >= ubq->q_depth) goto out; io = &ubq->ios[tag]; + /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ + if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { + ret = ublk_check_fetch_buf(ubq, ub_cmd->addr); + if (ret) + goto out; + ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); + if (ret) + goto out; + + ublk_prep_cancel(cmd, issue_flags, ubq, tag); + return -EIOCBQUEUED; + } + + if (READ_ONCE(io->task) != current) { + /* + * ublk_register_io_buf() accesses only the io's refcount, + * so can be handled on any task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) + return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, + issue_flags); + + goto out; + } /* there is pending io cmd, something must be wrong */ - if (io->flags & UBLK_IO_FLAG_ACTIVE) { + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) { ret = -EBUSY; goto out; } @@ -2083,54 +2344,44 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; - ret = ublk_check_cmd_op(cmd_op); - if (ret) - goto out; - - ret = -EINVAL; switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: - return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); - case UBLK_IO_UNREGISTER_IO_BUF: - return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); - case UBLK_IO_FETCH_REQ: - ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); - if (ret) - goto out; - break; + return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr, + issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: - req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); - - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr); + if (ret) goto out; + io->res = ub_cmd->result; + req = ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx); + compl = ublk_need_complete_req(ubq, io); + + /* can't touch 'ublk_io' any more */ + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); + if (req_op(req) == 
REQ_OP_ZONE_APPEND) + req->__sector = ub_cmd->zone_append_lba; + if (compl) + __ublk_complete_rq(req); - if (ublk_need_map_io(ubq)) { - /* - * COMMIT_AND_FETCH_REQ has to provide IO buffer if - * NEED GET DATA is not enabled or it is Read IO. - */ - if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || - req_op(req) == REQ_OP_READ)) - goto out; - } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { - /* - * User copy requires addr to be unset when command is - * not zone append - */ - ret = -EINVAL; + if (ret) goto out; - } - - ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - ublk_commit_completion(ub, ub_cmd); break; case UBLK_IO_NEED_GET_DATA: - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) - goto out; - ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); - ublk_dispatch_req(ubq, req, issue_flags); - return -EIOCBQUEUED; + /* + * ublk_get_data() may fail and fallback to requeue, so keep + * uring_cmd active first and prepare for handling new requeued + * request + */ + req = ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL); + WARN_ON_ONCE(ret); + if (likely(ublk_get_data(ubq, io, req))) { + __ublk_prep_compl_io_cmd(io, req); + return UBLK_IO_RES_OK; + } + break; default: goto out; } @@ -2144,15 +2395,20 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - const struct ublk_queue *ubq, int tag, size_t offset) + const struct ublk_queue *ubq, struct ublk_io *io, size_t offset) { + unsigned tag = io - ubq->ios; struct request *req; + /* + * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, + * which would overwrite it with io->cmd + */ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); if (!req) return NULL; - if (!ublk_get_req_ref(ubq, req)) + if (!ublk_get_req_ref(io)) return NULL; if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) @@ -2166,7 +2422,7 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, return req; fail_put: - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return NULL; } @@ -2233,7 +2489,8 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, } static struct request *ublk_check_and_get_req(struct kiocb *iocb, - struct iov_iter *iter, size_t *off, int dir) + struct iov_iter *iter, size_t *off, int dir, + struct ublk_io **io) { struct ublk_device *ub = iocb->ki_filp->private_data; struct ublk_queue *ubq; @@ -2267,7 +2524,8 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, if (tag >= ubq->q_depth) return ERR_PTR(-EINVAL); - req = __ublk_check_and_get_req(ub, ubq, tag, buf_off); + *io = &ubq->ios[tag]; + req = __ublk_check_and_get_req(ub, ubq, *io, buf_off); if (!req) return ERR_PTR(-EINVAL); @@ -2280,42 +2538,40 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, *off = buf_off; return req; fail: - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(*io, req); return ERR_PTR(-EACCES); } static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct ublk_queue *ubq; struct request *req; + struct ublk_io *io; size_t buf_off; size_t ret; - req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST); + req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io); if (IS_ERR(req)) return PTR_ERR(req); ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); - ubq = req->mq_hctx->driver_data; - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return 
ret; } static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct ublk_queue *ubq; struct request *req; + struct ublk_io *io; size_t buf_off; size_t ret; - req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE); + req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io); if (IS_ERR(req)) return PTR_ERR(req); ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); - ubq = req->mq_hctx->driver_data; - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return ret; } @@ -2334,9 +2590,16 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id) { int size = ublk_queue_cmd_buf_size(ub, q_id); struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + int i; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + if (io->task) + put_task_struct(io->task); + WARN_ON_ONCE(refcount_read(&io->ref)); + WARN_ON_ONCE(io->task_registered_buffers); + } - if (ubq->ubq_daemon) - put_task_struct(ubq->ubq_daemon); if (ubq->io_cmd_buf) free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); } @@ -2373,7 +2636,7 @@ static void ublk_deinit_queues(struct ublk_device *ub) for (i = 0; i < nr_queues; i++) ublk_deinit_queue(ub, i); - kfree(ub->__queues); + kvfree(ub->__queues); } static int ublk_init_queues(struct ublk_device *ub) @@ -2384,7 +2647,7 @@ static int ublk_init_queues(struct ublk_device *ub) int i, ret = -ENOMEM; ub->queue_size = ubq_size; - ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); + ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL); if (!ub->__queues) return ret; @@ -2440,6 +2703,7 @@ static void ublk_cdev_rel(struct device *dev) ublk_deinit_queues(ub); ublk_free_dev_number(ub); mutex_destroy(&ub->mutex); + mutex_destroy(&ub->cancel_mutex); kfree(ub); } @@ -2487,7 +2751,6 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; - ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.driver_data = ub; return blk_mq_alloc_tag_set(&ub->tag_set); } @@ -2589,6 +2852,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; + if (ub->ublksrv_tgid != ublksrv_pid) + return -EINVAL; + mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { @@ -2610,8 +2876,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, ublk_apply_params(ub); - /* don't probe partitions if any one ubq daemon is un-trusted */ - if (ub->nr_privileged_daemon != ub->nr_queues_ready) + /* don't probe partitions if any daemon task is un-trusted */ + if (ub->unprivileged_daemons) set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); ublk_get_device(ub); @@ -2710,6 +2976,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || + info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) + return -EINVAL; + if (capable(CAP_SYS_ADMIN)) info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) @@ -2728,6 +2998,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) return -EINVAL; } + if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) { + pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n"); + return -EINVAL; + } + /* * unprivileged 
device can't be trusted, but RECOVERY and * RECOVERY_REISSUE still may hang error handling, so can't @@ -2744,8 +3019,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) * For USER_COPY, we depends on userspace to fill request * buffer by pwrite() to ublk char device, which can't be * used for unprivileged device + * + * Same with zero copy or auto buffer register. */ - if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | + UBLK_F_AUTO_BUF_REG)) return -EINVAL; } @@ -2781,6 +3059,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) goto out_unlock; mutex_init(&ub->mutex); spin_lock_init(&ub->lock); + mutex_init(&ub->cancel_mutex); ret = ublk_alloc_dev_number(ub, header->dev_id); if (ret < 0) @@ -2800,10 +3079,13 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags &= UBLK_F_ALL; ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | - UBLK_F_URING_CMD_COMP_IN_TASK; + UBLK_F_URING_CMD_COMP_IN_TASK | + UBLK_F_PER_IO_DAEMON | + UBLK_F_BUF_REG_OFF_DAEMON; /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ - if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) + if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | + UBLK_F_AUTO_BUF_REG)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; /* @@ -2849,6 +3131,7 @@ out_free_dev_number: ublk_free_dev_number(ub); out_free_ub: mutex_destroy(&ub->mutex); + mutex_destroy(&ub->cancel_mutex); kfree(ub); out_unlock: mutex_unlock(&ublk_ctl_mutex); @@ -3064,14 +3347,17 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, int ublksrv_pid = (int)header->data[0]; int ret = -EINVAL; - pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", - __func__, ub->dev_info.nr_hw_queues, header->dev_id); - /* wait until new ubq_daemon sending all FETCH_REQ */ + pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__, + header->dev_id); + if (wait_for_completion_interruptible(&ub->completion)) return -EINTR; - pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", - __func__, ub->dev_info.nr_hw_queues, header->dev_id); + pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__, + header->dev_id); + + if (ub->ublksrv_tgid != ublksrv_pid) + return -EINVAL; mutex_lock(&ub->mutex); if (ublk_nosrv_should_stop_dev(ub)) @@ -3106,6 +3392,125 @@ static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header) return 0; } +static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) +{ + struct ublk_param_basic *p = &ub->params.basic; + u64 new_size = header->data[0]; + + mutex_lock(&ub->mutex); + p->dev_sectors = new_size; + set_capacity_and_notify(ub->ub_disk, p->dev_sectors); + mutex_unlock(&ub->mutex); +} + +struct count_busy { + const struct ublk_queue *ubq; + unsigned int nr_busy; +}; + +static bool ublk_count_busy_req(struct request *rq, void *data) +{ + struct count_busy *idle = data; + + if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq) + idle->nr_busy += 1; + return true; +} + +/* uring_cmd is guaranteed to be active if the associated request is idle */ +static bool ubq_has_idle_io(const struct ublk_queue *ubq) +{ + struct count_busy data = { + .ubq = ubq, + }; + + blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data); + return data.nr_busy < ubq->q_depth; +} + +/* Wait until each hw queue has at least one idle IO */ +static int 
ublk_wait_for_idle_io(struct ublk_device *ub, + unsigned int timeout_ms) +{ + unsigned int elapsed = 0; + int ret; + + while (elapsed < timeout_ms && !signal_pending(current)) { + unsigned int queues_cancelable = 0; + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + queues_cancelable += !!ubq_has_idle_io(ubq); + } + + /* + * Each queue needs at least one active command for + * notifying ublk server + */ + if (queues_cancelable == ub->dev_info.nr_hw_queues) + break; + + msleep(UBLK_REQUEUE_DELAY_MS); + elapsed += UBLK_REQUEUE_DELAY_MS; + } + + if (signal_pending(current)) + ret = -EINTR; + else if (elapsed >= timeout_ms) + ret = -EBUSY; + else + ret = 0; + + return ret; +} + +static int ublk_ctrl_quiesce_dev(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) +{ + /* zero means wait forever */ + u64 timeout_ms = header->data[0]; + struct gendisk *disk; + int ret = -ENODEV; + + if (!(ub->dev_info.flags & UBLK_F_QUIESCE)) + return -EOPNOTSUPP; + + mutex_lock(&ub->mutex); + disk = ublk_get_disk(ub); + if (!disk) + goto unlock; + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + goto put_disk; + + ret = 0; + /* already in expected state */ + if (ub->dev_info.state != UBLK_S_DEV_LIVE) + goto put_disk; + + /* Mark the device as canceling */ + mutex_lock(&ub->cancel_mutex); + blk_mq_quiesce_queue(disk->queue); + ublk_set_canceling(ub, true); + blk_mq_unquiesce_queue(disk->queue); + mutex_unlock(&ub->cancel_mutex); + + if (!timeout_ms) + timeout_ms = UINT_MAX; + ret = ublk_wait_for_idle_io(ub, timeout_ms); + +put_disk: + ublk_put_disk(disk); +unlock: + mutex_unlock(&ub->mutex); + + /* Cancel pending uring_cmd */ + if (!ret) + ublk_cancel_dev(ub); + return ret; +} + /* * All control commands are sent via /dev/ublk-control, so we have to check * the destination device's permission @@ -3191,6 +3596,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_SET_PARAMS: case UBLK_CMD_START_USER_RECOVERY: case UBLK_CMD_END_USER_RECOVERY: + case UBLK_CMD_UPDATE_SIZE: + case UBLK_CMD_QUIESCE_DEV: mask = MAY_READ | MAY_WRITE; break; default: @@ -3282,6 +3689,13 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_END_USER_RECOVERY: ret = ublk_ctrl_end_recovery(ub, header); break; + case UBLK_CMD_UPDATE_SIZE: + ublk_ctrl_set_size(ub, header); + ret = 0; + break; + case UBLK_CMD_QUIESCE_DEV: + ret = ublk_ctrl_quiesce_dev(ub, header); + break; default: ret = -EOPNOTSUPP; break; @@ -3315,6 +3729,7 @@ static int __init ublk_init(void) BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8); init_waitqueue_head(&ublk_idr_wq); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7cffea01d868..e649fa67bac1 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -571,7 +571,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk, vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT); vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector); - err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL); + err = blk_rq_map_kern(req, report_buf, report_len, GFP_KERNEL); if (err) goto out; @@ -817,7 +817,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; - err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, 
GFP_KERNEL); + err = blk_rq_map_kern(req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL); if (err) goto out; @@ -976,9 +976,8 @@ static int init_vq(struct virtio_blk *vblk) return -EINVAL; } - num_vqs = min_t(unsigned int, - min_not_zero(num_request_queues, nr_cpu_ids), - num_vqs); + num_vqs = blk_mq_num_possible_queues( + min_not_zero(num_request_queues, num_vqs)); num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1); diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c new file mode 100644 index 000000000000..a423228e201b --- /dev/null +++ b/drivers/block/zloop.c @@ -0,0 +1,1386 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Christoph Hellwig. + * Copyright (c) 2025, Western Digital Corporation or its affiliates. + * + * Zoned Loop Device driver - exports a zoned block device using one file per + * zone as backing storage. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/blk-mq.h> +#include <linux/blkzoned.h> +#include <linux/pagemap.h> +#include <linux/miscdevice.h> +#include <linux/falloc.h> +#include <linux/mutex.h> +#include <linux/parser.h> +#include <linux/seq_file.h> + +/* + * Options for adding (and removing) a device. + */ +enum { + ZLOOP_OPT_ERR = 0, + ZLOOP_OPT_ID = (1 << 0), + ZLOOP_OPT_CAPACITY = (1 << 1), + ZLOOP_OPT_ZONE_SIZE = (1 << 2), + ZLOOP_OPT_ZONE_CAPACITY = (1 << 3), + ZLOOP_OPT_NR_CONV_ZONES = (1 << 4), + ZLOOP_OPT_BASE_DIR = (1 << 5), + ZLOOP_OPT_NR_QUEUES = (1 << 6), + ZLOOP_OPT_QUEUE_DEPTH = (1 << 7), + ZLOOP_OPT_BUFFERED_IO = (1 << 8), +}; + +static const match_table_t zloop_opt_tokens = { + { ZLOOP_OPT_ID, "id=%d" }, + { ZLOOP_OPT_CAPACITY, "capacity_mb=%u" }, + { ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" }, + { ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" }, + { ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" }, + { ZLOOP_OPT_BASE_DIR, "base_dir=%s" }, + { ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" }, + { ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" }, + { ZLOOP_OPT_BUFFERED_IO, "buffered_io" }, + { ZLOOP_OPT_ERR, NULL } +}; + +/* Default values for the "add" operation. */ +#define ZLOOP_DEF_ID -1 +#define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT) +#define ZLOOP_DEF_NR_ZONES 64 +#define ZLOOP_DEF_NR_CONV_ZONES 8 +#define ZLOOP_DEF_BASE_DIR "/var/local/zloop" +#define ZLOOP_DEF_NR_QUEUES 1 +#define ZLOOP_DEF_QUEUE_DEPTH 128 +#define ZLOOP_DEF_BUFFERED_IO false + +/* Arbitrary limit on the zone size (16GB). */ +#define ZLOOP_MAX_ZONE_SIZE_MB 16384 + +struct zloop_options { + unsigned int mask; + int id; + sector_t capacity; + sector_t zone_size; + sector_t zone_capacity; + unsigned int nr_conv_zones; + char *base_dir; + unsigned int nr_queues; + unsigned int queue_depth; + bool buffered_io; +}; + +/* + * Device states. 
+ */ +enum { + Zlo_creating = 0, + Zlo_live, + Zlo_deleting, +}; + +enum zloop_zone_flags { + ZLOOP_ZONE_CONV = 0, + ZLOOP_ZONE_SEQ_ERROR, +}; + +struct zloop_zone { + struct file *file; + + unsigned long flags; + struct mutex lock; + enum blk_zone_cond cond; + sector_t start; + sector_t wp; + + gfp_t old_gfp_mask; +}; + +struct zloop_device { + unsigned int id; + unsigned int state; + + struct blk_mq_tag_set tag_set; + struct gendisk *disk; + + struct workqueue_struct *workqueue; + bool buffered_io; + + const char *base_dir; + struct file *data_dir; + + unsigned int zone_shift; + sector_t zone_size; + sector_t zone_capacity; + unsigned int nr_zones; + unsigned int nr_conv_zones; + unsigned int block_size; + + struct zloop_zone zones[] __counted_by(nr_zones); +}; + +struct zloop_cmd { + struct work_struct work; + atomic_t ref; + sector_t sector; + sector_t nr_sectors; + long ret; + struct kiocb iocb; + struct bio_vec *bvec; +}; + +static DEFINE_IDR(zloop_index_idr); +static DEFINE_MUTEX(zloop_ctl_mutex); + +static unsigned int rq_zone_no(struct request *rq) +{ + struct zloop_device *zlo = rq->q->queuedata; + + return blk_rq_pos(rq) >> zlo->zone_shift; +} + +static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) +{ + struct zloop_zone *zone = &zlo->zones[zone_no]; + struct kstat stat; + sector_t file_sectors; + int ret; + + lockdep_assert_held(&zone->lock); + + ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0); + if (ret < 0) { + pr_err("Failed to get zone %u file stat (err=%d)\n", + zone_no, ret); + set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); + return ret; + } + + file_sectors = stat.size >> SECTOR_SHIFT; + if (file_sectors > zlo->zone_capacity) { + pr_err("Zone %u file too large (%llu sectors > %llu)\n", + zone_no, file_sectors, zlo->zone_capacity); + return -EINVAL; + } + + if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) { + pr_err("Zone %u file size not aligned to block size %u\n", + zone_no, zlo->block_size); + return -EINVAL; + } + + if (!file_sectors) { + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + } else if (file_sectors == zlo->zone_capacity) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zlo->zone_size; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + zone->wp = zone->start + file_sectors; + } + + return 0; +} + +static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no) +{ + struct zloop_zone *zone = &zlo->zones[zone_no]; + int ret = 0; + + if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) + return -EIO; + + mutex_lock(&zone->lock); + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + ret = zloop_update_seq_zone(zlo, zone_no); + if (ret) + goto unlock; + } + + switch (zone->cond) { + case BLK_ZONE_COND_EXP_OPEN: + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_IMP_OPEN: + zone->cond = BLK_ZONE_COND_EXP_OPEN; + break; + case BLK_ZONE_COND_FULL: + default: + ret = -EIO; + break; + } + +unlock: + mutex_unlock(&zone->lock); + + return ret; +} + +static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no) +{ + struct zloop_zone *zone = &zlo->zones[zone_no]; + int ret = 0; + + if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) + return -EIO; + + mutex_lock(&zone->lock); + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + ret = zloop_update_seq_zone(zlo, zone_no); + if (ret) + goto unlock; + } + + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + break; + case BLK_ZONE_COND_IMP_OPEN: + case 
BLK_ZONE_COND_EXP_OPEN: + if (zone->wp == zone->start) + zone->cond = BLK_ZONE_COND_EMPTY; + else + zone->cond = BLK_ZONE_COND_CLOSED; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + ret = -EIO; + break; + } + +unlock: + mutex_unlock(&zone->lock); + + return ret; +} + +static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no) +{ + struct zloop_zone *zone = &zlo->zones[zone_no]; + int ret = 0; + + if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) + return -EIO; + + mutex_lock(&zone->lock); + + if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) && + zone->cond == BLK_ZONE_COND_EMPTY) + goto unlock; + + if (vfs_truncate(&zone->file->f_path, 0)) { + set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); + ret = -EIO; + goto unlock; + } + + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); + +unlock: + mutex_unlock(&zone->lock); + + return ret; +} + +static int zloop_reset_all_zones(struct zloop_device *zlo) +{ + unsigned int i; + int ret; + + for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) { + ret = zloop_reset_zone(zlo, i); + if (ret) + return ret; + } + + return 0; +} + +static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) +{ + struct zloop_zone *zone = &zlo->zones[zone_no]; + int ret = 0; + + if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) + return -EIO; + + mutex_lock(&zone->lock); + + if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) && + zone->cond == BLK_ZONE_COND_FULL) + goto unlock; + + if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) { + set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); + ret = -EIO; + goto unlock; + } + + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zlo->zone_size; + clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); + + unlock: + mutex_unlock(&zone->lock); + + return ret; +} + +static void zloop_put_cmd(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + + if (!atomic_dec_and_test(&cmd->ref)) + return; + kfree(cmd->bvec); + cmd->bvec = NULL; + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); +} + +static void zloop_rw_complete(struct kiocb *iocb, long ret) +{ + struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb); + + cmd->ret = ret; + zloop_put_cmd(cmd); +} + +static void zloop_rw(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t sector = blk_rq_pos(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; + int rw = is_write ? ITER_SOURCE : ITER_DEST; + struct req_iterator rq_iter; + struct zloop_zone *zone; + struct iov_iter iter; + struct bio_vec tmp; + sector_t zone_end; + int nr_bvec = 0; + int ret; + + atomic_set(&cmd->ref, 2); + cmd->sector = sector; + cmd->nr_sectors = nr_sectors; + cmd->ret = 0; + + /* We should never get an I/O beyond the device capacity. */ + if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) { + ret = -EIO; + goto out; + } + zone = &zlo->zones[zone_no]; + zone_end = zone->start + zlo->zone_capacity; + + /* + * The block layer should never send requests that are not fully + * contained within the zone. 
+ */ + if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) { + ret = -EIO; + goto out; + } + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + mutex_lock(&zone->lock); + ret = zloop_update_seq_zone(zlo, zone_no); + mutex_unlock(&zone->lock); + if (ret) + goto out; + } + + if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { + mutex_lock(&zone->lock); + + if (is_append) { + sector = zone->wp; + cmd->sector = sector; + } + + /* + * Write operations must be aligned to the write pointer and + * fully contained within the zone capacity. + */ + if (sector != zone->wp || zone->wp + nr_sectors > zone_end) { + pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", + zone_no, sector, zone->wp); + ret = -EIO; + goto unlock; + } + + /* Implicitly open the target zone. */ + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + /* + * Advance the write pointer of sequential zones. If the write + * fails, the wp position will be corrected when the next I/O + * completes. + */ + zone->wp += nr_sectors; + if (zone->wp == zone_end) + zone->cond = BLK_ZONE_COND_FULL; + } + + rq_for_each_bvec(tmp, rq, rq_iter) + nr_bvec++; + + if (rq->bio != rq->biotail) { + struct bio_vec *bvec; + + cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO); + if (!cmd->bvec) { + ret = -EIO; + goto unlock; + } + + /* + * The bios of the request may be started from the middle of + * the 'bvec' because of bio splitting, so we can't directly + * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec + * API will take care of all details for us. + */ + bvec = cmd->bvec; + rq_for_each_bvec(tmp, rq, rq_iter) { + *bvec = tmp; + bvec++; + } + iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq)); + } else { + /* + * Same here, this bio may be started from the middle of the + * 'bvec' because of bio splitting, so offset from the bvec + * must be passed to iov iterator + */ + iov_iter_bvec(&iter, rw, + __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter), + nr_bvec, blk_rq_bytes(rq)); + iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; + } + + cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT; + cmd->iocb.ki_filp = zone->file; + cmd->iocb.ki_complete = zloop_rw_complete; + if (!zlo->buffered_io) + cmd->iocb.ki_flags = IOCB_DIRECT; + cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + + if (rw == ITER_SOURCE) + ret = zone->file->f_op->write_iter(&cmd->iocb, &iter); + else + ret = zone->file->f_op->read_iter(&cmd->iocb, &iter); +unlock: + if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) + mutex_unlock(&zone->lock); +out: + if (ret != -EIOCBQUEUED) + zloop_rw_complete(&cmd->iocb, ret); + zloop_put_cmd(cmd); +} + +static void zloop_handle_cmd(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + + switch (req_op(rq)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: + /* + * zloop_rw() always executes asynchronously or completes + * directly.
+ */ + zloop_rw(cmd); + return; + case REQ_OP_FLUSH: + /* + * Sync the entire FS containing the zone files instead of + * walking all files + */ + cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb); + break; + case REQ_OP_ZONE_RESET: + cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq)); + break; + case REQ_OP_ZONE_RESET_ALL: + cmd->ret = zloop_reset_all_zones(zlo); + break; + case REQ_OP_ZONE_FINISH: + cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq)); + break; + case REQ_OP_ZONE_OPEN: + cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq)); + break; + case REQ_OP_ZONE_CLOSE: + cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq)); + break; + default: + WARN_ON_ONCE(1); + pr_err("Unsupported operation %d\n", req_op(rq)); + cmd->ret = -EOPNOTSUPP; + break; + } + + blk_mq_complete_request(rq); +} + +static void zloop_cmd_workfn(struct work_struct *work) +{ + struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work); + int orig_flags = current->flags; + + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; + zloop_handle_cmd(cmd); + current->flags = orig_flags; +} + +static void zloop_complete_rq(struct request *rq) +{ + struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = cmd->sector >> zlo->zone_shift; + struct zloop_zone *zone = &zlo->zones[zone_no]; + blk_status_t sts = BLK_STS_OK; + + switch (req_op(rq)) { + case REQ_OP_READ: + if (cmd->ret < 0) + pr_err("Zone %u: failed read sector %llu, %llu sectors\n", + zone_no, cmd->sector, cmd->nr_sectors); + + if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { + /* short read */ + struct bio *bio; + + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); + } + break; + case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: + if (cmd->ret < 0) + pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n", + zone_no, + req_op(rq) == REQ_OP_WRITE ? "" : "append ", + cmd->sector, cmd->nr_sectors); + + if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { + pr_err("Zone %u: partial write %ld/%u B\n", + zone_no, cmd->ret, blk_rq_bytes(rq)); + cmd->ret = -EIO; + } + + if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { + /* + * A write to a sequential zone file failed: mark the + * zone as having an error. This will be corrected and + * cleared when the next IO is submitted. 
+ */ + set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); + break; + } + if (req_op(rq) == REQ_OP_ZONE_APPEND) + rq->__sector = cmd->sector; + + break; + default: + break; + } + + if (cmd->ret < 0) + sts = errno_to_blk_status(cmd->ret); + blk_mq_end_request(rq, sts); +} + +static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct zloop_device *zlo = rq->q->queuedata; + + if (zlo->state == Zlo_deleting) + return BLK_STS_IOERR; + + blk_mq_start_request(rq); + + INIT_WORK(&cmd->work, zloop_cmd_workfn); + queue_work(zlo->workqueue, &cmd->work); + + return BLK_STS_OK; +} + +static const struct blk_mq_ops zloop_mq_ops = { + .queue_rq = zloop_queue_rq, + .complete = zloop_complete_rq, +}; + +static int zloop_open(struct gendisk *disk, blk_mode_t mode) +{ + struct zloop_device *zlo = disk->private_data; + int ret; + + ret = mutex_lock_killable(&zloop_ctl_mutex); + if (ret) + return ret; + + if (zlo->state != Zlo_live) + ret = -ENXIO; + mutex_unlock(&zloop_ctl_mutex); + return ret; +} + +static int zloop_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct zloop_device *zlo = disk->private_data; + struct blk_zone blkz = {}; + unsigned int first, i; + int ret; + + first = disk_zone_no(disk, sector); + if (first >= zlo->nr_zones) + return 0; + nr_zones = min(nr_zones, zlo->nr_zones - first); + + for (i = 0; i < nr_zones; i++) { + unsigned int zone_no = first + i; + struct zloop_zone *zone = &zlo->zones[zone_no]; + + mutex_lock(&zone->lock); + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + ret = zloop_update_seq_zone(zlo, zone_no); + if (ret) { + mutex_unlock(&zone->lock); + return ret; + } + } + + blkz.start = zone->start; + blkz.len = zlo->zone_size; + blkz.wp = zone->wp; + blkz.cond = zone->cond; + if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { + blkz.type = BLK_ZONE_TYPE_CONVENTIONAL; + blkz.capacity = zlo->zone_size; + } else { + blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + blkz.capacity = zlo->zone_capacity; + } + + mutex_unlock(&zone->lock); + + ret = cb(&blkz, i, data); + if (ret) + return ret; + } + + return nr_zones; +} + +static void zloop_free_disk(struct gendisk *disk) +{ + struct zloop_device *zlo = disk->private_data; + unsigned int i; + + blk_mq_free_tag_set(&zlo->tag_set); + + for (i = 0; i < zlo->nr_zones; i++) { + struct zloop_zone *zone = &zlo->zones[i]; + + mapping_set_gfp_mask(zone->file->f_mapping, + zone->old_gfp_mask); + fput(zone->file); + } + + fput(zlo->data_dir); + destroy_workqueue(zlo->workqueue); + kfree(zlo->base_dir); + kvfree(zlo); +} + +static const struct block_device_operations zloop_fops = { + .owner = THIS_MODULE, + .open = zloop_open, + .report_zones = zloop_report_zones, + .free_disk = zloop_free_disk, +}; + +__printf(3, 4) +static struct file *zloop_filp_open_fmt(int oflags, umode_t mode, + const char *fmt, ...) +{ + struct file *file; + va_list ap; + char *p; + + va_start(ap, fmt); + p = kvasprintf(GFP_KERNEL, fmt, ap); + va_end(ap); + + if (!p) + return ERR_PTR(-ENOMEM); + file = filp_open(p, oflags, mode); + kfree(p); + return file; +} + +static int zloop_get_block_size(struct zloop_device *zlo, + struct zloop_zone *zone) +{ + struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev; + struct kstat st; + + /* + * If the FS block size is lower than or equal to 4K, use that as the + * device block size. 
Otherwise, fallback to the FS direct IO alignment + * constraint if that is provided, and to the FS underlying device + * physical block size if the direct IO alignment is unknown. + */ + if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K) + zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize; + else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) && + (st.result_mask & STATX_DIOALIGN)) + zlo->block_size = st.dio_offset_align; + else if (sb_bdev) + zlo->block_size = bdev_physical_block_size(sb_bdev); + else + zlo->block_size = SECTOR_SIZE; + + if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) { + pr_err("Zone capacity is not aligned to block size %u\n", + zlo->block_size); + return -EINVAL; + } + + return 0; +} + +static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts, + unsigned int zone_no, bool restore) +{ + struct zloop_zone *zone = &zlo->zones[zone_no]; + int oflags = O_RDWR; + struct kstat stat; + sector_t file_sectors; + int ret; + + mutex_init(&zone->lock); + zone->start = (sector_t)zone_no << zlo->zone_shift; + + if (!restore) + oflags |= O_CREAT; + + if (!opts->buffered_io) + oflags |= O_DIRECT; + + if (zone_no < zlo->nr_conv_zones) { + /* Conventional zone file. */ + set_bit(ZLOOP_ZONE_CONV, &zone->flags); + zone->cond = BLK_ZONE_COND_NOT_WP; + zone->wp = U64_MAX; + + zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u", + zlo->base_dir, zlo->id, zone_no); + if (IS_ERR(zone->file)) { + pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)", + zone_no, zlo->base_dir, zlo->id, zone_no, + PTR_ERR(zone->file)); + return PTR_ERR(zone->file); + } + + if (!zlo->block_size) { + ret = zloop_get_block_size(zlo, zone); + if (ret) + return ret; + } + + ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0); + if (ret < 0) { + pr_err("Failed to get zone %u file stat\n", zone_no); + return ret; + } + file_sectors = stat.size >> SECTOR_SHIFT; + + if (restore && file_sectors != zlo->zone_size) { + pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n", + zone_no, file_sectors, zlo->zone_capacity); + return ret; + } + + ret = vfs_truncate(&zone->file->f_path, + zlo->zone_size << SECTOR_SHIFT); + if (ret < 0) { + pr_err("Failed to truncate zone %u file (err=%d)\n", + zone_no, ret); + return ret; + } + + return 0; + } + + /* Sequential zone file. 
*/ + zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u", + zlo->base_dir, zlo->id, zone_no); + if (IS_ERR(zone->file)) { + pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)", + zone_no, zlo->base_dir, zlo->id, zone_no, + PTR_ERR(zone->file)); + return PTR_ERR(zone->file); + } + + if (!zlo->block_size) { + ret = zloop_get_block_size(zlo, zone); + if (ret) + return ret; + } + + zloop_get_block_size(zlo, zone); + + mutex_lock(&zone->lock); + ret = zloop_update_seq_zone(zlo, zone_no); + mutex_unlock(&zone->lock); + + return ret; +} + +static bool zloop_dev_exists(struct zloop_device *zlo) +{ + struct file *cnv, *seq; + bool exists; + + cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u", + zlo->base_dir, zlo->id, 0); + seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u", + zlo->base_dir, zlo->id, 0); + exists = !IS_ERR(cnv) || !IS_ERR(seq); + + if (!IS_ERR(cnv)) + fput(cnv); + if (!IS_ERR(seq)) + fput(seq); + + return exists; +} + +static int zloop_ctl_add(struct zloop_options *opts) +{ + struct queue_limits lim = { + .max_hw_sectors = SZ_1M >> SECTOR_SHIFT, + .max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT, + .chunk_sectors = opts->zone_size, + .features = BLK_FEAT_ZONED, + }; + unsigned int nr_zones, i, j; + struct zloop_device *zlo; + int ret = -EINVAL; + bool restore; + + __module_get(THIS_MODULE); + + nr_zones = opts->capacity >> ilog2(opts->zone_size); + if (opts->nr_conv_zones >= nr_zones) { + pr_err("Invalid number of conventional zones %u\n", + opts->nr_conv_zones); + goto out; + } + + zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL); + if (!zlo) { + ret = -ENOMEM; + goto out; + } + zlo->state = Zlo_creating; + + ret = mutex_lock_killable(&zloop_ctl_mutex); + if (ret) + goto out_free_dev; + + /* Allocate id, if @opts->id >= 0, we're requesting that specific id */ + if (opts->id >= 0) { + ret = idr_alloc(&zloop_index_idr, zlo, + opts->id, opts->id + 1, GFP_KERNEL); + if (ret == -ENOSPC) + ret = -EEXIST; + } else { + ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL); + } + mutex_unlock(&zloop_ctl_mutex); + if (ret < 0) + goto out_free_dev; + + zlo->id = ret; + zlo->zone_shift = ilog2(opts->zone_size); + zlo->zone_size = opts->zone_size; + if (opts->zone_capacity) + zlo->zone_capacity = opts->zone_capacity; + else + zlo->zone_capacity = zlo->zone_size; + zlo->nr_zones = nr_zones; + zlo->nr_conv_zones = opts->nr_conv_zones; + zlo->buffered_io = opts->buffered_io; + + zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE, + opts->nr_queues * opts->queue_depth, zlo->id); + if (!zlo->workqueue) { + ret = -ENOMEM; + goto out_free_idr; + } + + if (opts->base_dir) + zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL); + else + zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL); + if (!zlo->base_dir) { + ret = -ENOMEM; + goto out_destroy_workqueue; + } + + zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u", + zlo->base_dir, zlo->id); + if (IS_ERR(zlo->data_dir)) { + ret = PTR_ERR(zlo->data_dir); + pr_warn("Failed to open directory %s/%u (err=%d)\n", + zlo->base_dir, zlo->id, ret); + goto out_free_base_dir; + } + + /* + * If we already have zone files, we are restoring a device created by a + * previous add operation. In this case, zloop_init_zone() will check + * that the zone files are consistent with the zone configuration given. 
+ */ + restore = zloop_dev_exists(zlo); + for (i = 0; i < nr_zones; i++) { + ret = zloop_init_zone(zlo, opts, i, restore); + if (ret) + goto out_close_files; + } + + lim.physical_block_size = zlo->block_size; + lim.logical_block_size = zlo->block_size; + + zlo->tag_set.ops = &zloop_mq_ops; + zlo->tag_set.nr_hw_queues = opts->nr_queues; + zlo->tag_set.queue_depth = opts->queue_depth; + zlo->tag_set.numa_node = NUMA_NO_NODE; + zlo->tag_set.cmd_size = sizeof(struct zloop_cmd); + zlo->tag_set.driver_data = zlo; + + ret = blk_mq_alloc_tag_set(&zlo->tag_set); + if (ret) { + pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret); + goto out_close_files; + } + + zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo); + if (IS_ERR(zlo->disk)) { + pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret); + ret = PTR_ERR(zlo->disk); + goto out_cleanup_tags; + } + zlo->disk->flags = GENHD_FL_NO_PART; + zlo->disk->fops = &zloop_fops; + zlo->disk->private_data = zlo; + sprintf(zlo->disk->disk_name, "zloop%d", zlo->id); + set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones); + + ret = blk_revalidate_disk_zones(zlo->disk); + if (ret) + goto out_cleanup_disk; + + ret = add_disk(zlo->disk); + if (ret) { + pr_err("add_disk failed (err=%d)\n", ret); + goto out_cleanup_disk; + } + + mutex_lock(&zloop_ctl_mutex); + zlo->state = Zlo_live; + mutex_unlock(&zloop_ctl_mutex); + + pr_info("Added device %d: %u zones of %llu MB, %u B block size\n", + zlo->id, zlo->nr_zones, + ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20, + zlo->block_size); + + return 0; + +out_cleanup_disk: + put_disk(zlo->disk); +out_cleanup_tags: + blk_mq_free_tag_set(&zlo->tag_set); +out_close_files: + for (j = 0; j < i; j++) { + struct zloop_zone *zone = &zlo->zones[j]; + + if (!IS_ERR_OR_NULL(zone->file)) + fput(zone->file); + } + fput(zlo->data_dir); +out_free_base_dir: + kfree(zlo->base_dir); +out_destroy_workqueue: + destroy_workqueue(zlo->workqueue); +out_free_idr: + mutex_lock(&zloop_ctl_mutex); + idr_remove(&zloop_index_idr, zlo->id); + mutex_unlock(&zloop_ctl_mutex); +out_free_dev: + kvfree(zlo); +out: + module_put(THIS_MODULE); + if (ret == -ENOENT) + ret = -EINVAL; + return ret; +} + +static int zloop_ctl_remove(struct zloop_options *opts) +{ + struct zloop_device *zlo; + int ret; + + if (!(opts->mask & ZLOOP_OPT_ID)) { + pr_err("No ID specified\n"); + return -EINVAL; + } + + ret = mutex_lock_killable(&zloop_ctl_mutex); + if (ret) + return ret; + + zlo = idr_find(&zloop_index_idr, opts->id); + if (!zlo || zlo->state == Zlo_creating) { + ret = -ENODEV; + } else if (zlo->state == Zlo_deleting) { + ret = -EINVAL; + } else { + idr_remove(&zloop_index_idr, zlo->id); + zlo->state = Zlo_deleting; + } + + mutex_unlock(&zloop_ctl_mutex); + if (ret) + return ret; + + del_gendisk(zlo->disk); + put_disk(zlo->disk); + + pr_info("Removed device %d\n", opts->id); + + module_put(THIS_MODULE); + + return 0; +} + +static int zloop_parse_options(struct zloop_options *opts, const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + unsigned int token; + int ret = 0; + + /* Set defaults. */ + opts->mask = 0; + opts->id = ZLOOP_DEF_ID; + opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES; + opts->zone_size = ZLOOP_DEF_ZONE_SIZE; + opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES; + opts->nr_queues = ZLOOP_DEF_NR_QUEUES; + opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH; + opts->buffered_io = ZLOOP_DEF_BUFFERED_IO; + + if (!buf) + return 0; + + /* Skip leading spaces before the options. 
*/ + while (isspace(*buf)) + buf++; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + /* Parse the options, doing only some light invalid value checks. */ + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, zloop_opt_tokens, args); + opts->mask |= token; + switch (token) { + case ZLOOP_OPT_ID: + if (match_int(args, &opts->id)) { + ret = -EINVAL; + goto out; + } + break; + case ZLOOP_OPT_CAPACITY: + if (match_uint(args, &token)) { + ret = -EINVAL; + goto out; + } + if (!token) { + pr_err("Invalid capacity\n"); + ret = -EINVAL; + goto out; + } + opts->capacity = + ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; + break; + case ZLOOP_OPT_ZONE_SIZE: + if (match_uint(args, &token)) { + ret = -EINVAL; + goto out; + } + if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB || + !is_power_of_2(token)) { + pr_err("Invalid zone size %u\n", token); + ret = -EINVAL; + goto out; + } + opts->zone_size = + ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; + break; + case ZLOOP_OPT_ZONE_CAPACITY: + if (match_uint(args, &token)) { + ret = -EINVAL; + goto out; + } + if (!token) { + pr_err("Invalid zone capacity\n"); + ret = -EINVAL; + goto out; + } + opts->zone_capacity = + ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; + break; + case ZLOOP_OPT_NR_CONV_ZONES: + if (match_uint(args, &token)) { + ret = -EINVAL; + goto out; + } + opts->nr_conv_zones = token; + break; + case ZLOOP_OPT_BASE_DIR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->base_dir); + opts->base_dir = p; + break; + case ZLOOP_OPT_NR_QUEUES: + if (match_uint(args, &token)) { + ret = -EINVAL; + goto out; + } + if (!token) { + pr_err("Invalid number of queues\n"); + ret = -EINVAL; + goto out; + } + opts->nr_queues = min(token, num_online_cpus()); + break; + case ZLOOP_OPT_QUEUE_DEPTH: + if (match_uint(args, &token)) { + ret = -EINVAL; + goto out; + } + if (!token) { + pr_err("Invalid queue depth\n"); + ret = -EINVAL; + goto out; + } + opts->queue_depth = token; + break; + case ZLOOP_OPT_BUFFERED_IO: + opts->buffered_io = true; + break; + case ZLOOP_OPT_ERR: + default: + pr_warn("unknown parameter or missing value '%s'\n", p); + ret = -EINVAL; + goto out; + } + } + + ret = -EINVAL; + if (opts->capacity <= opts->zone_size) { + pr_err("Invalid capacity\n"); + goto out; + } + + if (opts->zone_capacity > opts->zone_size) { + pr_err("Invalid zone capacity\n"); + goto out; + } + + ret = 0; +out: + kfree(options); + return ret; +} + +enum { + ZLOOP_CTL_ADD, + ZLOOP_CTL_REMOVE, +}; + +static struct zloop_ctl_op { + int code; + const char *name; +} zloop_ctl_ops[] = { + { ZLOOP_CTL_ADD, "add" }, + { ZLOOP_CTL_REMOVE, "remove" }, + { -1, NULL }, +}; + +static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *pos) +{ + struct zloop_options opts = { }; + struct zloop_ctl_op *op; + const char *buf, *opts_buf; + int i, ret; + + if (count > PAGE_SIZE) + return -ENOMEM; + + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) { + op = &zloop_ctl_ops[i]; + if (!op->name) { + pr_err("Invalid operation\n"); + ret = -EINVAL; + goto out; + } + if (!strncmp(buf, op->name, strlen(op->name))) + break; + } + + if (count <= strlen(op->name)) + opts_buf = NULL; + else + opts_buf = buf + strlen(op->name); + + ret = zloop_parse_options(&opts, opts_buf); + if (ret) { + pr_err("Failed to parse options\n"); + goto out; + } + + switch (op->code) { + case ZLOOP_CTL_ADD: + ret 
= zloop_ctl_add(&opts); + break; + case ZLOOP_CTL_REMOVE: + ret = zloop_ctl_remove(&opts); + break; + default: + pr_err("Invalid operation\n"); + ret = -EINVAL; + goto out; + } + +out: + kfree(opts.base_dir); + kfree(buf); + return ret ? ret : count; +} + +static int zloop_ctl_show(struct seq_file *seq_file, void *private) +{ + const struct match_token *tok; + int i; + + /* Add operation */ + seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name); + for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) { + tok = &zloop_opt_tokens[i]; + if (!tok->pattern) + break; + if (i) + seq_putc(seq_file, ','); + seq_puts(seq_file, tok->pattern); + } + seq_putc(seq_file, '\n'); + + /* Remove operation */ + seq_puts(seq_file, zloop_ctl_ops[1].name); + seq_puts(seq_file, " id=%d\n"); + + return 0; +} + +static int zloop_ctl_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return single_open(file, zloop_ctl_show, NULL); +} + +static int zloop_ctl_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} + +static const struct file_operations zloop_ctl_fops = { + .owner = THIS_MODULE, + .open = zloop_ctl_open, + .release = zloop_ctl_release, + .write = zloop_ctl_write, + .read = seq_read, +}; + +static struct miscdevice zloop_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "zloop-control", + .fops = &zloop_ctl_fops, +}; + +static int __init zloop_init(void) +{ + int ret; + + ret = misc_register(&zloop_misc); + if (ret) { + pr_err("Failed to register misc device: %d\n", ret); + return ret; + } + pr_info("Module loaded\n"); + + return 0; +} + +static void __exit zloop_exit(void) +{ + misc_deregister(&zloop_misc); + idr_destroy(&zloop_index_idr); +} + +module_init(zloop_init); +module_exit(zloop_exit); + +MODULE_DESCRIPTION("Zoned loopback device"); +MODULE_LICENSE("GPL"); diff --git a/drivers/block/zram/backend_deflate.c b/drivers/block/zram/backend_deflate.c index 0f7f252c12f4..b75016e0e654 100644 --- a/drivers/block/zram/backend_deflate.c +++ b/drivers/block/zram/backend_deflate.c @@ -8,7 +8,7 @@ #include "backend_deflate.h" /* Use the same value as crypto API */ -#define DEFLATE_DEF_WINBITS 11 +#define DEFLATE_DEF_WINBITS (-11) #define DEFLATE_DEF_MEMLEVEL MAX_MEM_LEVEL struct deflate_ctx { @@ -22,8 +22,10 @@ static void deflate_release_params(struct zcomp_params *params) static int deflate_setup_params(struct zcomp_params *params) { - if (params->level == ZCOMP_PARAM_NO_LEVEL) + if (params->level == ZCOMP_PARAM_NOT_SET) params->level = Z_DEFAULT_COMPRESSION; + if (params->deflate.winbits == ZCOMP_PARAM_NOT_SET) + params->deflate.winbits = DEFLATE_DEF_WINBITS; return 0; } @@ -57,13 +59,13 @@ static int deflate_create(struct zcomp_params *params, struct zcomp_ctx *ctx) return -ENOMEM; ctx->context = zctx; - sz = zlib_deflate_workspacesize(-DEFLATE_DEF_WINBITS, MAX_MEM_LEVEL); + sz = zlib_deflate_workspacesize(params->deflate.winbits, MAX_MEM_LEVEL); zctx->cctx.workspace = vzalloc(sz); if (!zctx->cctx.workspace) goto error; ret = zlib_deflateInit2(&zctx->cctx, params->level, Z_DEFLATED, - -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL, + params->deflate.winbits, DEFLATE_DEF_MEMLEVEL, Z_DEFAULT_STRATEGY); if (ret != Z_OK) goto error; @@ -73,7 +75,7 @@ static int deflate_create(struct zcomp_params *params, struct zcomp_ctx *ctx) if (!zctx->dctx.workspace) goto error; - ret = zlib_inflateInit2(&zctx->dctx, -DEFLATE_DEF_WINBITS); + ret = zlib_inflateInit2(&zctx->dctx, params->deflate.winbits); if (ret != Z_OK) goto error; diff --git 
a/drivers/block/zram/backend_lz4.c b/drivers/block/zram/backend_lz4.c index 847f3334eb38..daccd60857eb 100644 --- a/drivers/block/zram/backend_lz4.c +++ b/drivers/block/zram/backend_lz4.c @@ -18,7 +18,7 @@ static void lz4_release_params(struct zcomp_params *params) static int lz4_setup_params(struct zcomp_params *params) { - if (params->level == ZCOMP_PARAM_NO_LEVEL) + if (params->level == ZCOMP_PARAM_NOT_SET) params->level = LZ4_ACCELERATION_DEFAULT; return 0; diff --git a/drivers/block/zram/backend_lz4hc.c b/drivers/block/zram/backend_lz4hc.c index 5f37d5abcaeb..9e8a35dfa56d 100644 --- a/drivers/block/zram/backend_lz4hc.c +++ b/drivers/block/zram/backend_lz4hc.c @@ -18,7 +18,7 @@ static void lz4hc_release_params(struct zcomp_params *params) static int lz4hc_setup_params(struct zcomp_params *params) { - if (params->level == ZCOMP_PARAM_NO_LEVEL) + if (params->level == ZCOMP_PARAM_NOT_SET) params->level = LZ4HC_DEFAULT_CLEVEL; return 0; diff --git a/drivers/block/zram/backend_zstd.c b/drivers/block/zram/backend_zstd.c index 22c8067536f3..81defb98ed09 100644 --- a/drivers/block/zram/backend_zstd.c +++ b/drivers/block/zram/backend_zstd.c @@ -58,7 +58,7 @@ static int zstd_setup_params(struct zcomp_params *params) return -ENOMEM; params->drv_data = zp; - if (params->level == ZCOMP_PARAM_NO_LEVEL) + if (params->level == ZCOMP_PARAM_NOT_SET) params->level = zstd_default_clevel(); zp->cprm = zstd_get_params(params->level, PAGE_SIZE); diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index d26a58c67e95..b1bd1daa0060 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/cpuhotplug.h> #include <linux/vmalloc.h> +#include <linux/sysfs.h> #include "zcomp.h" @@ -89,23 +90,21 @@ bool zcomp_available_algorithm(const char *comp) } /* show available compressors */ -ssize_t zcomp_available_show(const char *comp, char *buf) +ssize_t zcomp_available_show(const char *comp, char *buf, ssize_t at) { - ssize_t sz = 0; int i; for (i = 0; i < ARRAY_SIZE(backends) - 1; i++) { if (!strcmp(comp, backends[i]->name)) { - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", backends[i]->name); + at += sysfs_emit_at(buf, at, "[%s] ", + backends[i]->name); } else { - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "%s ", backends[i]->name); + at += sysfs_emit_at(buf, at, "%s ", backends[i]->name); } } - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); - return sz; + at += sysfs_emit_at(buf, at, "\n"); + return at; } struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 25339ed1e07e..eacfd3f7d61d 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -5,7 +5,11 @@ #include <linux/mutex.h> -#define ZCOMP_PARAM_NO_LEVEL INT_MIN +#define ZCOMP_PARAM_NOT_SET INT_MIN + +struct deflate_params { + s32 winbits; +}; /* * Immutable driver (backend) parameters. 
The driver may attach private @@ -17,6 +21,9 @@ struct zcomp_params { void *dict; size_t dict_sz; s32 level; + union { + struct deflate_params deflate; + }; void *drv_data; }; @@ -72,7 +79,7 @@ struct zcomp { int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node); int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node); -ssize_t zcomp_available_show(const char *comp, char *buf); +ssize_t zcomp_available_show(const char *comp, char *buf, ssize_t at); bool zcomp_available_algorithm(const char *comp); struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fda7d8624889..8acad3cc6e6e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -373,7 +373,7 @@ static ssize_t initstate_show(struct device *dev, val = init_done(zram); up_read(&zram->init_lock); - return scnprintf(buf, PAGE_SIZE, "%u\n", val); + return sysfs_emit(buf, "%u\n", val); } static ssize_t disksize_show(struct device *dev, @@ -381,7 +381,7 @@ static ssize_t disksize_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); + return sysfs_emit(buf, "%llu\n", zram->disksize); } static ssize_t mem_limit_store(struct device *dev, @@ -532,7 +532,7 @@ static ssize_t writeback_limit_enable_show(struct device *dev, spin_unlock(&zram->wb_limit_lock); up_read(&zram->init_lock); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } static ssize_t writeback_limit_store(struct device *dev, @@ -567,7 +567,7 @@ static ssize_t writeback_limit_show(struct device *dev, spin_unlock(&zram->wb_limit_lock); up_read(&zram->init_lock); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return sysfs_emit(buf, "%llu\n", val); } static void reset_bdev(struct zram *zram) @@ -734,114 +734,19 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, submit_bio(bio); } -#define PAGE_WB_SIG "page_index=" - -#define PAGE_WRITEBACK 0 -#define HUGE_WRITEBACK (1<<0) -#define IDLE_WRITEBACK (1<<1) -#define INCOMPRESSIBLE_WRITEBACK (1<<2) - -static int scan_slots_for_writeback(struct zram *zram, u32 mode, - unsigned long nr_pages, - unsigned long index, - struct zram_pp_ctl *ctl) -{ - for (; nr_pages != 0; index++, nr_pages--) { - bool ok = true; - - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) - goto next; - - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_SAME)) - goto next; - - if (mode & IDLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_IDLE)) - goto next; - if (mode & HUGE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_HUGE)) - goto next; - if (mode & INCOMPRESSIBLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) - goto next; - - ok = place_pp_slot(zram, ctl, index); -next: - zram_slot_unlock(zram, index); - if (!ok) - break; - } - - return 0; -} - -static ssize_t writeback_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl) { - struct zram *zram = dev_to_zram(dev); - unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; - struct zram_pp_ctl *ctl = NULL; + unsigned long blk_idx = 0; + struct page *page = NULL; struct zram_pp_slot *pps; - unsigned long index = 0; - struct bio bio; struct bio_vec bio_vec; - struct page *page = NULL; - ssize_t ret = len; - int mode, err; - unsigned long blk_idx = 0; - - if 
(sysfs_streq(buf, "idle")) - mode = IDLE_WRITEBACK; - else if (sysfs_streq(buf, "huge")) - mode = HUGE_WRITEBACK; - else if (sysfs_streq(buf, "huge_idle")) - mode = IDLE_WRITEBACK | HUGE_WRITEBACK; - else if (sysfs_streq(buf, "incompressible")) - mode = INCOMPRESSIBLE_WRITEBACK; - else { - if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) - return -EINVAL; - - if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || - index >= nr_pages) - return -EINVAL; - - nr_pages = 1; - mode = PAGE_WRITEBACK; - } - - down_read(&zram->init_lock); - if (!init_done(zram)) { - ret = -EINVAL; - goto release_init_lock; - } - - /* Do not permit concurrent post-processing actions. */ - if (atomic_xchg(&zram->pp_in_progress, 1)) { - up_read(&zram->init_lock); - return -EAGAIN; - } - - if (!zram->backing_dev) { - ret = -ENODEV; - goto release_init_lock; - } + struct bio bio; + int ret = 0, err; + u32 index; page = alloc_page(GFP_KERNEL); - if (!page) { - ret = -ENOMEM; - goto release_init_lock; - } - - ctl = init_pp_ctl(); - if (!ctl) { - ret = -ENOMEM; - goto release_init_lock; - } - - scan_slots_for_writeback(zram, mode, nr_pages, index, ctl); + if (!page) + return -ENOMEM; while ((pps = select_pp_slot(ctl))) { spin_lock(&zram->wb_limit_lock); @@ -929,10 +834,215 @@ next: if (blk_idx) free_block_bdev(zram, blk_idx); - -release_init_lock: if (page) __free_page(page); + + return ret; +} + +#define PAGE_WRITEBACK 0 +#define HUGE_WRITEBACK (1 << 0) +#define IDLE_WRITEBACK (1 << 1) +#define INCOMPRESSIBLE_WRITEBACK (1 << 2) + +static int parse_page_index(char *val, unsigned long nr_pages, + unsigned long *lo, unsigned long *hi) +{ + int ret; + + ret = kstrtoul(val, 10, lo); + if (ret) + return ret; + if (*lo >= nr_pages) + return -ERANGE; + *hi = *lo + 1; + return 0; +} + +static int parse_page_indexes(char *val, unsigned long nr_pages, + unsigned long *lo, unsigned long *hi) +{ + char *delim; + int ret; + + delim = strchr(val, '-'); + if (!delim) + return -EINVAL; + + *delim = 0x00; + ret = kstrtoul(val, 10, lo); + if (ret) + return ret; + if (*lo >= nr_pages) + return -ERANGE; + + ret = kstrtoul(delim + 1, 10, hi); + if (ret) + return ret; + if (*hi >= nr_pages || *lo > *hi) + return -ERANGE; + *hi += 1; + return 0; +} + +static int parse_mode(char *val, u32 *mode) +{ + *mode = 0; + + if (!strcmp(val, "idle")) + *mode = IDLE_WRITEBACK; + if (!strcmp(val, "huge")) + *mode = HUGE_WRITEBACK; + if (!strcmp(val, "huge_idle")) + *mode = IDLE_WRITEBACK | HUGE_WRITEBACK; + if (!strcmp(val, "incompressible")) + *mode = INCOMPRESSIBLE_WRITEBACK; + + if (*mode == 0) + return -EINVAL; + return 0; +} + +static int scan_slots_for_writeback(struct zram *zram, u32 mode, + unsigned long lo, unsigned long hi, + struct zram_pp_ctl *ctl) +{ + u32 index = lo; + + while (index < hi) { + bool ok = true; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME)) + goto next; + + if (mode & IDLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_IDLE)) + goto next; + if (mode & HUGE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_HUGE)) + goto next; + if (mode & INCOMPRESSIBLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + + ok = place_pp_slot(zram, ctl, index); +next: + zram_slot_unlock(zram, index); + if (!ok) + break; + index++; + } + + return 0; +} + +static ssize_t writeback_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram 
*zram = dev_to_zram(dev); + u64 nr_pages = zram->disksize >> PAGE_SHIFT; + unsigned long lo = 0, hi = nr_pages; + struct zram_pp_ctl *ctl = NULL; + char *args, *param, *val; + ssize_t ret = len; + int err, mode = 0; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } + + /* Do not permit concurrent post-processing actions. */ + if (atomic_xchg(&zram->pp_in_progress, 1)) { + up_read(&zram->init_lock); + return -EAGAIN; + } + + if (!zram->backing_dev) { + ret = -ENODEV; + goto release_init_lock; + } + + ctl = init_pp_ctl(); + if (!ctl) { + ret = -ENOMEM; + goto release_init_lock; + } + + args = skip_spaces(buf); + while (*args) { + args = next_arg(args, ¶m, &val); + + /* + * Workaround to support the old writeback interface. + * + * The old writeback interface has a minor inconsistency and + * requires key=value only for page_index parameter, while the + * writeback mode is a valueless parameter. + * + * This is not the case anymore and now all parameters are + * required to have values, however, we need to support the + * legacy writeback interface format so we check if we can + * recognize a valueless parameter as the (legacy) writeback + * mode. + */ + if (!val || !*val) { + err = parse_mode(param, &mode); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + break; + } + + if (!strcmp(param, "type")) { + err = parse_mode(val, &mode); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + break; + } + + if (!strcmp(param, "page_index")) { + err = parse_page_index(val, nr_pages, &lo, &hi); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + continue; + } + + if (!strcmp(param, "page_indexes")) { + err = parse_page_indexes(val, nr_pages, &lo, &hi); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + continue; + } + } + + err = zram_writeback_slots(zram, ctl); + if (err) + ret = err; + +release_init_lock: release_pp_ctl(zram, ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); @@ -1115,12 +1225,13 @@ static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) zram->comp_algs[prio] = alg; } -static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, char *buf) +static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, + char *buf, ssize_t at) { ssize_t sz; down_read(&zram->init_lock); - sz = zcomp_available_show(zram->comp_algs[prio], buf); + sz = zcomp_available_show(zram->comp_algs[prio], buf, at); up_read(&zram->init_lock); return sz; @@ -1166,13 +1277,15 @@ static void comp_params_reset(struct zram *zram, u32 prio) struct zcomp_params *params = &zram->params[prio]; vfree(params->dict); - params->level = ZCOMP_PARAM_NO_LEVEL; + params->level = ZCOMP_PARAM_NOT_SET; + params->deflate.winbits = ZCOMP_PARAM_NOT_SET; params->dict_sz = 0; params->dict = NULL; } static int comp_params_store(struct zram *zram, u32 prio, s32 level, - const char *dict_path) + const char *dict_path, + struct deflate_params *deflate_params) { ssize_t sz = 0; @@ -1190,6 +1303,7 @@ static int comp_params_store(struct zram *zram, u32 prio, s32 level, zram->params[prio].dict_sz = sz; zram->params[prio].level = level; + zram->params[prio].deflate.winbits = deflate_params->winbits; return 0; } @@ -1198,11 +1312,14 @@ static ssize_t algorithm_params_store(struct device *dev, const 
char *buf, size_t len) { - s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NO_LEVEL; + s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NOT_SET; char *args, *param, *val, *algo = NULL, *dict_path = NULL; + struct deflate_params deflate_params; struct zram *zram = dev_to_zram(dev); int ret; + deflate_params.winbits = ZCOMP_PARAM_NOT_SET; + args = skip_spaces(buf); while (*args) { args = next_arg(args, ¶m, &val); @@ -1233,6 +1350,13 @@ static ssize_t algorithm_params_store(struct device *dev, dict_path = val; continue; } + + if (!strcmp(param, "deflate.winbits")) { + ret = kstrtoint(val, 10, &deflate_params.winbits); + if (ret) + return ret; + continue; + } } /* Lookup priority by algorithm name */ @@ -1254,7 +1378,7 @@ static ssize_t algorithm_params_store(struct device *dev, if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS) return -EINVAL; - ret = comp_params_store(zram, prio, level, dict_path); + ret = comp_params_store(zram, prio, level, dict_path, &deflate_params); return ret ? ret : len; } @@ -1264,7 +1388,7 @@ static ssize_t comp_algorithm_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf); + return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf, 0); } static ssize_t comp_algorithm_store(struct device *dev, @@ -1292,8 +1416,8 @@ static ssize_t recomp_algorithm_show(struct device *dev, if (!zram->comp_algs[prio]) continue; - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "#%d: ", prio); - sz += __comp_algorithm_show(zram, prio, buf + sz); + sz += sysfs_emit_at(buf, sz, "#%d: ", prio); + sz += __comp_algorithm_show(zram, prio, buf, sz); } return sz; @@ -1365,7 +1489,7 @@ static ssize_t io_stat_show(struct device *dev, ssize_t ret; down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, + ret = sysfs_emit(buf, "%8llu %8llu 0 %8llu\n", (u64)atomic64_read(&zram->stats.failed_reads), (u64)atomic64_read(&zram->stats.failed_writes), @@ -1395,7 +1519,7 @@ static ssize_t mm_stat_show(struct device *dev, orig_size = atomic64_read(&zram->stats.pages_stored); max_used = atomic_long_read(&zram->stats.max_used_pages); - ret = scnprintf(buf, PAGE_SIZE, + ret = sysfs_emit(buf, "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), @@ -1420,8 +1544,8 @@ static ssize_t bd_stat_show(struct device *dev, ssize_t ret; down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu\n", + ret = sysfs_emit(buf, + "%8llu %8llu %8llu\n", FOUR_K((u64)atomic64_read(&zram->stats.bd_count)), FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)), FOUR_K((u64)atomic64_read(&zram->stats.bd_writes))); @@ -1439,7 +1563,7 @@ static ssize_t debug_stat_show(struct device *dev, ssize_t ret; down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, + ret = sysfs_emit(buf, "version: %d\n0 %8llu\n", version, (u64)atomic64_read(&zram->stats.miss_free)); @@ -1694,7 +1818,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, */ handle = zs_malloc(zram->mem_pool, PAGE_SIZE, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); @@ -1761,7 +1885,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle)) 
{ zcomp_stream_put(zstrm); return PTR_ERR((void *)handle); @@ -1981,10 +2105,15 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, * We are holding per-CPU stream mutex and entry lock so better * avoid direct reclaim. Allocation error is not fatal since * we still have the old object in the mem_pool. + * + * XXX: technically, the node we really want here is the node that holds + * the original compressed data. But that would require us to modify + * zsmalloc API to return this information. For now, we will make do with + * the node of the page allocated for recompression. */ handle_new = zs_malloc(zram->mem_pool, comp_len_new, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle_new)) { zcomp_stream_put(zstrm); return PTR_ERR((void *)handle_new); @@ -2682,7 +2811,7 @@ static ssize_t hot_add_show(const struct class *class, if (ret < 0) return ret; - return scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return sysfs_emit(buf, "%d\n", ret); } /* This attribute must be set to 0400, so CLASS_ATTR_RO() can not be used */ static struct class_attribute class_attr_hot_add = |
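
The UBLK_CMD_UPDATE_SIZE and UBLK_CMD_QUIESCE_DEV commands added in the ublk hunks above are plain /dev/ublk-control uring_cmds whose only argument travels in data[0] of struct ublksrv_ctrl_cmd: the new capacity in sectors for UPDATE_SIZE, and the timeout in milliseconds (0 meaning wait forever) for QUIESCE_DEV, which additionally requires a device created with UBLK_F_QUIESCE. The following is a rough userspace sketch only, not part of the patch or of libublksrv: it assumes liburing, the <linux/ublk_cmd.h> uapi header, and an io_uring created with IORING_SETUP_SQE128, and the helper name is invented for this example.

  /*
   * Hypothetical sketch: issue a ublk control command whose only
   * argument is carried in data[0], e.g. the quiesce timeout in
   * milliseconds or the new device size in sectors. Error handling
   * is deliberately minimal.
   */
  #include <liburing.h>
  #include <linux/ublk_cmd.h>
  #include <string.h>

  static int ublk_ctrl_cmd_data(struct io_uring *ring, int ctrl_fd,
                                unsigned int cmd_op, unsigned int dev_id,
                                __u64 data0)
  {
          struct ublksrv_ctrl_cmd cmd = {
                  .dev_id = dev_id,
                  .queue_id = (__u16)-1,  /* not a per-queue command */
                  .data = { data0 },
          };
          struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
          struct io_uring_cqe *cqe;
          int ret;

          if (!sqe)
                  return -1;
          /* Control commands are IORING_OP_URING_CMD on /dev/ublk-control. */
          io_uring_prep_rw(IORING_OP_URING_CMD, sqe, ctrl_fd, NULL, 0, 0);
          sqe->cmd_op = cmd_op;
          memcpy(sqe->cmd, &cmd, sizeof(cmd));

          io_uring_submit(ring);
          ret = io_uring_wait_cqe(ring, &cqe);
          if (ret)
                  return ret;
          ret = cqe->res;
          io_uring_cqe_seen(ring, cqe);
          return ret;
  }

Per the permission table extended above, both commands need read and write access to the device's character node, like the other state-changing control commands.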
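
For the new zloop driver, the geometry is derived entirely from the add options: zone_size_mb must be a power of two no larger than 16384, nr_zones is capacity_mb divided by zone_size_mb, the first conv_zones zones are conventional, and every zone is backed by one file named cnv-%06u or seq-%06u under <base_dir>/<id>. The short standalone sketch below is not driver code; it just re-does that arithmetic for the defaults defined in the file, in 512-byte sectors.

  /*
   * Standalone check of the default zloop geometry: 256 MB zones,
   * 64 zones (16 GB capacity), the first 8 of them conventional.
   * Mirrors the nr_zones = capacity >> ilog2(zone_size) computation
   * in zloop_ctl_add().
   */
  #include <assert.h>

  int main(void)
  {
          unsigned long long zone_size = (256ULL << 20) >> 9;  /* sectors */
          unsigned long long capacity  = zone_size * 64;       /* sectors */
          unsigned int nr_zones = capacity >> __builtin_ctzll(zone_size);

          assert(zone_size == 524288);   /* 256 MB */
          assert(nr_zones == 64);        /* 16 GB / 256 MB */
          assert(8 < nr_zones);          /* conv_zones must be < nr_zones */
          return 0;
  }

Re-running an add against existing zone files restores the device: zloop_init_zone() then opens the files without O_CREAT, and zloop_update_seq_zone() rebuilds each sequential zone's condition and write pointer from the backing file size, which is also how the driver recovers after a write error.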
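
The rewritten writeback_store() keeps the old bare mode names ("idle", "huge", "huge_idle", "incompressible") working but now also accepts key=value arguments: type= for the mode, page_index= for a single page, and page_indexes=lo-hi for an inclusive range. Each recognized parameter triggers a slot scan immediately, and a bare mode name or a type= parameter ends the parsing loop. A small userspace sketch follows; the device name and page indexes are example values only, and it assumes zram0 exists with backing_dev configured.

  /*
   * Sketch: drive the extended zram writeback interface. Assumes
   * /sys/block/zram0 exists and a backing device has been set up;
   * the page indexes below are arbitrary examples.
   */
  #include <fcntl.h>
  #include <string.h>
  #include <unistd.h>

  static int zram_writeback(const char *args)
  {
          int fd = open("/sys/block/zram0/writeback", O_WRONLY);
          ssize_t ret;

          if (fd < 0)
                  return -1;
          ret = write(fd, args, strlen(args));
          close(fd);
          return ret < 0 ? -1 : 0;
  }

  int main(void)
  {
          zram_writeback("idle");                 /* legacy form, still accepted */
          zram_writeback("type=huge_idle");       /* new key=value form          */
          zram_writeback("page_index=42");        /* a single page               */
          zram_writeback("page_indexes=100-200"); /* an inclusive range          */
          return 0;
  }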
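
The deflate backend changes expose the zlib window bits through the existing algorithm_params attribute, with negative values selecting a raw deflate stream just like the new -11 default. A hedged example of setting it is sketched below; it assumes the parameters are written before the device is initialized and that deflate has already been selected as the (re)compression algorithm for that priority, and -12 with level 6 are arbitrary example values.

  /*
   * Sketch: pass deflate window bits via algorithm_params. Assumes
   * /sys/block/zram0 exists, is not yet initialized, and "deflate"
   * is one of its configured compression algorithms.
   */
  #include <fcntl.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          const char *params = "algo=deflate level=6 deflate.winbits=-12";
          int fd = open("/sys/block/zram0/algorithm_params", O_WRONLY);

          if (fd < 0)
                  return 1;
          if (write(fd, params, strlen(params)) < 0) {
                  close(fd);
                  return 1;
          }
          close(fd);
          return 0;
  }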