Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/Kconfig                 |   62
-rw-r--r--  drivers/block/Makefile                |    2
-rw-r--r--  drivers/block/aoe/aoe.h               |    1
-rw-r--r--  drivers/block/aoe/aoecmd.c            |   10
-rw-r--r--  drivers/block/aoe/aoedev.c            |   15
-rw-r--r--  drivers/block/brd.c                   |  229
-rw-r--r--  drivers/block/drbd/drbd_int.h         |   39
-rw-r--r--  drivers/block/drbd/drbd_main.c        |   62
-rw-r--r--  drivers/block/drbd/drbd_receiver.c    |  270
-rw-r--r--  drivers/block/drbd/drbd_req.c         |    3
-rw-r--r--  drivers/block/drbd/drbd_worker.c      |   62
-rw-r--r--  drivers/block/floppy.c                |    2
-rw-r--r--  drivers/block/loop.c                  |   56
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c     |   34
-rw-r--r--  drivers/block/nbd.c                   |   18
-rw-r--r--  drivers/block/null_blk/main.c         |    2
-rw-r--r--  drivers/block/pktcdvd.c               | 2916
-rw-r--r--  drivers/block/rnbd/rnbd-srv.c         |    7
-rw-r--r--  drivers/block/sunvdc.c                |    4
-rw-r--r--  drivers/block/swim3.c                 |    8
-rw-r--r--  drivers/block/ublk_drv.c              | 1071
-rw-r--r--  drivers/block/virtio_blk.c            |    9
-rw-r--r--  drivers/block/zloop.c                 | 1386
-rw-r--r--  drivers/block/zram/backend_deflate.c  |   12
-rw-r--r--  drivers/block/zram/backend_lz4.c      |    2
-rw-r--r--  drivers/block/zram/backend_lz4hc.c    |    2
-rw-r--r--  drivers/block/zram/backend_zstd.c     |    2
-rw-r--r--  drivers/block/zram/zcomp.c            |   15
-rw-r--r--  drivers/block/zram/zcomp.h            |   11
-rw-r--r--  drivers/block/zram/zram_drv.c         |  383
30 files changed, 2697 insertions, 3998 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e48b24be45ee..df38fb364904 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -256,49 +256,6 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know
what you are doing.
-config CDROM_PKTCDVD
- tristate "Packet writing on CD/DVD media (DEPRECATED)"
- depends on !UML
- depends on SCSI
- select CDROM
- help
- Note: This driver is deprecated and will be removed from the
- kernel in the near future!
-
- If you have a CDROM/DVD drive that supports packet writing, say
- Y to include support. It should work with any MMC/Mt Fuji
- compliant ATAPI or SCSI drive, which is just about any newer
- DVD/CD writer.
-
- Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs
- is possible.
- DVD-RW disks must be in restricted overwrite mode.
-
- See the file <file:Documentation/cdrom/packet-writing.rst>
- for further information on the use of this driver.
-
- To compile this driver as a module, choose M here: the
- module will be called pktcdvd.
-
-config CDROM_PKTCDVD_BUFFERS
- int "Free buffers for data gathering"
- depends on CDROM_PKTCDVD
- default "8"
- help
- This controls the maximum number of active concurrent packets. More
- concurrent packets can increase write performance, but also require
- more memory. Each concurrent packet will require approximately 64Kb
- of non-swappable kernel memory, memory which will be allocated when
- a disc is opened for writing.
-
-config CDROM_PKTCDVD_WCACHE
- bool "Enable write caching"
- depends on CDROM_PKTCDVD
- help
- If enabled, write caching will be set for the CD-R/W device. For now
- this option is dangerous unless the CD-RW media is known good, as we
- don't do deferred write error handling yet.
-
config ATA_OVER_ETH
tristate "ATA over Ethernet support"
depends on NET
@@ -407,4 +364,23 @@ config BLKDEV_UBLK_LEGACY_OPCODES
source "drivers/block/rnbd/Kconfig"
+config BLK_DEV_ZONED_LOOP
+ tristate "Zoned loopback device support"
+ depends on BLK_DEV_ZONED
+ help
+ Saying Y here will allow you to create a zoned block device using
+ regular files for zones (one file per zone). This is useful to test
+ file systems, device mapper and applications that support zoned block
+ devices. To create a zoned loop device, no user utility is needed; a
+ zoned loop device can be created (or re-started) using a command
+ like:
+
+ echo "add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11" > \
+ /dev/zloop-control
+
+ See Documentation/admin-guide/blockdev/zoned_loop.rst for usage
+ details.
+
+ If unsure, say N.
+
endif # BLK_DEV
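
For context, the control-file interface described in the new help text can
also be driven from C rather than the shell; a minimal userspace sketch,
assuming the zloop-control command syntax shown above and that the module is
loaded:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Same command string as in the Kconfig help text above. */
		const char *cmd =
			"add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11";
		int fd = open("/dev/zloop-control", O_WRONLY);

		if (fd < 0) {
			perror("open /dev/zloop-control");
			return 1;
		}
		if (write(fd, cmd, strlen(cmd)) < 0) {
			perror("write");
			close(fd);
			return 1;
		}
		close(fd);	/* /dev/zloop0 should now exist */
		return 0;
	}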
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1105a2d4fdcb..a695ce74ef22 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,7 +23,6 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
obj-$(CONFIG_N64CART) += n64cart.o
obj-$(CONFIG_BLK_DEV_RAM) += brd.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
-obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
obj-$(CONFIG_SUNVDC) += sunvdc.o
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
@@ -41,5 +40,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
+obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 749ae1246f4c..d35caa3c69e1 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -80,6 +80,7 @@ enum {
DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */
DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */
DEVFL_FREED = (1<<8), /* device has been cleaned up */
+ DEVFL_DEAD = (1<<9), /* device has timed out of aoe_deadsecs */
};
enum {
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 92b06d1de4cc..6298f8e271e3 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -745,7 +745,7 @@ rexmit_timer(struct timer_list *timer)
int utgts; /* number of aoetgt descriptors (not slots) */
int since;
- d = from_timer(d, timer, timer);
+ d = timer_container_of(d, timer, timer);
spin_lock_irqsave(&d->lock, flags);
@@ -754,7 +754,7 @@ rexmit_timer(struct timer_list *timer)
utgts = count_targets(d, NULL);
- if (d->flags & DEVFL_TKILL) {
+ if (d->flags & (DEVFL_TKILL | DEVFL_DEAD)) {
spin_unlock_irqrestore(&d->lock, flags);
return;
}
@@ -786,7 +786,8 @@ rexmit_timer(struct timer_list *timer)
* to clean up.
*/
list_splice(&flist, &d->factive[0]);
- aoedev_downdev(d);
+ d->flags |= DEVFL_DEAD;
+ queue_work(aoe_wq, &d->work);
goto out;
}
@@ -898,6 +899,9 @@ aoecmd_sleepwork(struct work_struct *work)
{
struct aoedev *d = container_of(work, struct aoedev, work);
+ if (d->flags & DEVFL_DEAD)
+ aoedev_downdev(d);
+
if (d->flags & DEVFL_GDALLOC)
aoeblk_gdalloc(d);
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 141b2a0e03f2..3a240755045b 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -149,7 +149,7 @@ dummy_timer(struct timer_list *t)
{
struct aoedev *d;
- d = from_timer(d, t, timer);
+ d = timer_container_of(d, t, timer);
if (d->flags & DEVFL_TKILL)
return;
d->timer.expires = jiffies + HZ;
@@ -198,9 +198,13 @@ aoedev_downdev(struct aoedev *d)
{
struct aoetgt *t, **tt, **te;
struct list_head *head, *pos, *nx;
+ struct request *rq, *rqnext;
int i;
+ unsigned long flags;
- d->flags &= ~DEVFL_UP;
+ spin_lock_irqsave(&d->lock, flags);
+ d->flags &= ~(DEVFL_UP | DEVFL_DEAD);
+ spin_unlock_irqrestore(&d->lock, flags);
/* clean out active and to-be-retransmitted buffers */
for (i = 0; i < NFACTIVE; i++) {
@@ -223,6 +227,13 @@ aoedev_downdev(struct aoedev *d)
/* clean out the in-process request (if any) */
aoe_failip(d);
+ /* clean out any queued block requests */
+ list_for_each_entry_safe(rq, rqnext, &d->rq_list, queuelist) {
+ list_del_init(&rq->queuelist);
+ blk_mq_start_request(rq);
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+ }
+
/* fast fail all pending I/O */
if (d->blkq) {
/* UP is cleared, freeze+quiesce to ensure all are errored */
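
The aoe changes above move the heavyweight teardown out of the atomic timer
callback: rexmit_timer() now only sets DEVFL_DEAD and queues work, and the
sleepable worker calls aoedev_downdev(). A minimal sketch of that
mark-and-defer pattern, using hypothetical names (mydev, MYDEV_DEAD), not the
driver's actual structures:

	#include <linux/spinlock.h>
	#include <linux/timer.h>
	#include <linux/workqueue.h>

	#define MYDEV_DEAD	(1 << 0)

	struct mydev {
		spinlock_t		lock;
		unsigned long		flags;
		struct timer_list	timer;
		struct work_struct	work;
	};

	static void mydev_timeout(struct timer_list *t)	/* atomic context */
	{
		struct mydev *d = timer_container_of(d, t, timer);
		unsigned long flags;

		spin_lock_irqsave(&d->lock, flags);
		d->flags |= MYDEV_DEAD;		/* cheap: just mark the device */
		spin_unlock_irqrestore(&d->lock, flags);
		queue_work(system_wq, &d->work);	/* defer the real work */
	}

	static void mydev_workfn(struct work_struct *work)	/* may sleep */
	{
		struct mydev *d = container_of(work, struct mydev, work);

		if (d->flags & MYDEV_DEAD) {
			/* sleepable teardown goes here, e.g. failing
			 * queued requests as aoedev_downdev() does */
		}
	}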
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 292f127cae0a..0c2eabe14af3 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -54,32 +54,35 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
/*
* Insert a new page for a given sector, if one does not already exist.
*/
-static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
+ blk_opf_t opf)
+ __releases(rcu)
+ __acquires(rcu)
{
- pgoff_t idx = sector >> PAGE_SECTORS_SHIFT;
- struct page *page;
- int ret = 0;
-
- page = brd_lookup_page(brd, sector);
- if (page)
- return 0;
+ gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
+ struct page *page, *ret;
+ rcu_read_unlock();
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
- if (!page)
- return -ENOMEM;
+ if (!page) {
+ rcu_read_lock();
+ return ERR_PTR(-ENOMEM);
+ }
xa_lock(&brd->brd_pages);
- ret = __xa_insert(&brd->brd_pages, idx, page, gfp);
- if (!ret)
- brd->brd_nr_pages++;
- xa_unlock(&brd->brd_pages);
-
- if (ret < 0) {
+ ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
+ page, gfp);
+ rcu_read_lock();
+ if (ret) {
+ xa_unlock(&brd->brd_pages);
__free_page(page);
- if (ret == -EBUSY)
- ret = 0;
+ if (xa_is_err(ret))
+ return ERR_PTR(xa_err(ret));
+ return ret;
}
- return ret;
+ brd->brd_nr_pages++;
+ xa_unlock(&brd->brd_pages);
+ return page;
}
/*
@@ -100,143 +103,77 @@ static void brd_free_pages(struct brd_device *brd)
}
/*
- * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
+ * Process a single segment. The segment is capped to not cross page boundaries
+ * in both the bio and the brd backing memory.
*/
-static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
- gfp_t gfp)
-{
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
- int ret;
-
- copy = min_t(size_t, n, PAGE_SIZE - offset);
- ret = brd_insert_page(brd, sector, gfp);
- if (ret)
- return ret;
- if (copy < n) {
- sector += copy >> SECTOR_SHIFT;
- ret = brd_insert_page(brd, sector, gfp);
- }
- return ret;
-}
-
-/*
- * Copy n bytes from src to the brd starting at sector. Does not sleep.
- */
-static void copy_to_brd(struct brd_device *brd, const void *src,
- sector_t sector, size_t n)
+static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
{
+ struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
+ sector_t sector = bio->bi_iter.bi_sector;
+ u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+ blk_opf_t opf = bio->bi_opf;
struct page *page;
- void *dst;
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
+ void *kaddr;
- copy = min_t(size_t, n, PAGE_SIZE - offset);
- page = brd_lookup_page(brd, sector);
- BUG_ON(!page);
-
- dst = kmap_atomic(page);
- memcpy(dst + offset, src, copy);
- kunmap_atomic(dst);
-
- if (copy < n) {
- src += copy;
- sector += copy >> SECTOR_SHIFT;
- copy = n - copy;
- page = brd_lookup_page(brd, sector);
- BUG_ON(!page);
-
- dst = kmap_atomic(page);
- memcpy(dst, src, copy);
- kunmap_atomic(dst);
- }
-}
-
-/*
- * Copy n bytes to dst from the brd starting at sector. Does not sleep.
- */
-static void copy_from_brd(void *dst, struct brd_device *brd,
- sector_t sector, size_t n)
-{
- struct page *page;
- void *src;
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
+ bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
- copy = min_t(size_t, n, PAGE_SIZE - offset);
+ rcu_read_lock();
page = brd_lookup_page(brd, sector);
- if (page) {
- src = kmap_atomic(page);
- memcpy(dst, src + offset, copy);
- kunmap_atomic(src);
- } else
- memset(dst, 0, copy);
-
- if (copy < n) {
- dst += copy;
- sector += copy >> SECTOR_SHIFT;
- copy = n - copy;
- page = brd_lookup_page(brd, sector);
- if (page) {
- src = kmap_atomic(page);
- memcpy(dst, src, copy);
- kunmap_atomic(src);
- } else
- memset(dst, 0, copy);
+ if (!page && op_is_write(opf)) {
+ page = brd_insert_page(brd, sector, opf);
+ if (IS_ERR(page))
+ goto out_error;
}
-}
-
-/*
- * Process a single bvec of a bio.
- */
-static int brd_do_bvec(struct brd_device *brd, struct page *page,
- unsigned int len, unsigned int off, blk_opf_t opf,
- sector_t sector)
-{
- void *mem;
- int err = 0;
+ kaddr = bvec_kmap_local(&bv);
if (op_is_write(opf)) {
- /*
- * Must use NOIO because we don't want to recurse back into the
- * block or filesystem layers from page reclaim.
- */
- gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
-
- err = copy_to_brd_setup(brd, sector, len, gfp);
- if (err)
- goto out;
- }
-
- mem = kmap_atomic(page);
- if (!op_is_write(opf)) {
- copy_from_brd(mem + off, brd, sector, len);
- flush_dcache_page(page);
+ memcpy_to_page(page, offset, kaddr, bv.bv_len);
} else {
- flush_dcache_page(page);
- copy_to_brd(brd, mem + off, sector, len);
+ if (page)
+ memcpy_from_page(kaddr, page, offset, bv.bv_len);
+ else
+ memset(kaddr, 0, bv.bv_len);
}
- kunmap_atomic(mem);
+ kunmap_local(kaddr);
+ rcu_read_unlock();
+
+ bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+ return true;
+
+out_error:
+ rcu_read_unlock();
+ if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ return false;
+}
-out:
- return err;
+static void brd_free_one_page(struct rcu_head *head)
+{
+ struct page *page = container_of(head, struct page, rcu_head);
+
+ __free_page(page);
}
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
- sector_t aligned_sector = (sector + PAGE_SECTORS) & ~PAGE_SECTORS;
+ sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
+ sector_t aligned_end = round_down(
+ sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
struct page *page;
- size -= (aligned_sector - sector) * SECTOR_SIZE;
+ if (aligned_end <= aligned_sector)
+ return;
+
xa_lock(&brd->brd_pages);
- while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) {
+ while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
if (page) {
- __free_page(page);
+ call_rcu(&page->rcu_head, brd_free_one_page);
brd->brd_nr_pages--;
}
aligned_sector += PAGE_SECTORS;
- size -= PAGE_SIZE;
}
xa_unlock(&brd->brd_pages);
}
@@ -244,36 +181,18 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
static void brd_submit_bio(struct bio *bio)
{
struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
- sector_t sector = bio->bi_iter.bi_sector;
- struct bio_vec bvec;
- struct bvec_iter iter;
if (unlikely(op_is_discard(bio->bi_opf))) {
- brd_do_discard(brd, sector, bio->bi_iter.bi_size);
+ brd_do_discard(brd, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size);
bio_endio(bio);
return;
}
- bio_for_each_segment(bvec, bio, iter) {
- unsigned int len = bvec.bv_len;
- int err;
-
- /* Don't support un-aligned buffer */
- WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
- (len & (SECTOR_SIZE - 1)));
-
- err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
- bio->bi_opf, sector);
- if (err) {
- if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
- bio_wouldblock_error(bio);
- return;
- }
- bio_io_error(bio);
+ do {
+ if (!brd_rw_bvec(brd, bio))
return;
- }
- sector += len >> SECTOR_SHIFT;
- }
+ } while (bio->bi_iter.bi_size);
bio_endio(bio);
}
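
The rewritten brd fast path looks pages up under rcu_read_lock() and only
takes the xarray lock on the insert slow path, using a compare-and-exchange
against NULL so a racing writer's page wins gracefully; discard frees pages
via call_rcu() so readers holding the RCU lock stay safe. A condensed sketch
of that lookup-or-insert shape (simplified; brd itself keeps the RCU read
lock held while copying to or from the page):

	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/xarray.h>

	static struct page *lookup_or_insert(struct xarray *xa, pgoff_t idx,
					     gfp_t gfp)
	{
		struct page *page, *cur;

		page = xa_load(xa, idx);	/* RCU-safe lockless lookup */
		if (page)
			return page;

		page = alloc_page(gfp | __GFP_ZERO);
		if (!page)
			return NULL;

		/* cmpxchg against NULL: the first inserter wins the slot */
		cur = xa_cmpxchg(xa, idx, NULL, page, gfp);
		if (cur) {
			__free_page(page);	/* lost the race, or error */
			return xa_is_err(cur) ? NULL : cur;
		}
		return page;
	}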
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e21492981f7d..f6d6276974ee 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -380,6 +380,9 @@ enum {
/* this is/was a write request */
__EE_WRITE,
+ /* hand back using mempool_free(e, drbd_buffer_page_pool) */
+ __EE_RELEASE_TO_MEMPOOL,
+
/* this is/was a write same request */
__EE_WRITE_SAME,
@@ -402,6 +405,7 @@ enum {
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
#define EE_SUBMITTED (1<<__EE_SUBMITTED)
#define EE_WRITE (1<<__EE_WRITE)
+#define EE_RELEASE_TO_MEMPOOL (1<<__EE_RELEASE_TO_MEMPOOL)
#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
#define EE_APPLICATION (1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
@@ -858,7 +862,6 @@ struct drbd_device {
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* need to send P_WRITE_ACK */
struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
- struct list_head net_ee; /* zero-copy network send in progress */
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
@@ -1329,24 +1332,6 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t drbd_request_mempool;
extern mempool_t drbd_ee_mempool;
-/* drbd's page pool, used to buffer data received from the peer,
- * or data requested by the peer.
- *
- * This does not have an emergency reserve.
- *
- * When allocating from this pool, it first takes pages from the pool.
- * Only if the pool is depleted will try to allocate from the system.
- *
- * The assumption is that pages taken from this pool will be processed,
- * and given back, "quickly", and then can be recycled, so we can avoid
- * frequent calls to alloc_page(), and still will be able to make progress even
- * under memory pressure.
- */
-extern struct page *drbd_pp_pool;
-extern spinlock_t drbd_pp_lock;
-extern int drbd_pp_vacant;
-extern wait_queue_head_t drbd_pp_wait;
-
/* We also need a standard (emergency-reserve backed) page pool
* for meta data IO (activity log, bitmap).
* We can keep it global, as long as it is used as "N pages at a time".
@@ -1354,6 +1339,7 @@ extern wait_queue_head_t drbd_pp_wait;
*/
#define DRBD_MIN_POOL_PAGES 128
extern mempool_t drbd_md_io_page_pool;
+extern mempool_t drbd_buffer_page_pool;
/* We also need to make sure we get a bio
* when we need it for housekeeping purposes */
@@ -1488,10 +1474,7 @@ extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *,
sector_t, unsigned int,
unsigned int,
gfp_t) __must_hold(local);
-extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
- int);
-#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
-#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *req);
extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
extern int drbd_connected(struct drbd_peer_device *);
@@ -1610,16 +1593,6 @@ static inline struct page *page_chain_next(struct page *page)
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
-{
- struct page *page = peer_req->pages;
- page_chain_for_each(page) {
- if (page_count(page) > 1)
- return 1;
- }
- return 0;
-}
-
static inline union drbd_state drbd_read_state(struct drbd_device *device)
{
struct drbd_resource *resource = device->resource;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index ced2cc5f46f2..c73376886e7a 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -114,20 +114,10 @@ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t drbd_request_mempool;
mempool_t drbd_ee_mempool;
mempool_t drbd_md_io_page_pool;
+mempool_t drbd_buffer_page_pool;
struct bio_set drbd_md_io_bio_set;
struct bio_set drbd_io_bio_set;
-/* I do not use a standard mempool, because:
- 1) I want to hand out the pre-allocated objects first.
- 2) I want to be able to interrupt sleeping allocation with a signal.
- Note: This is a single linked list, the next pointer is the private
- member of struct page.
- */
-struct page *drbd_pp_pool;
-DEFINE_SPINLOCK(drbd_pp_lock);
-int drbd_pp_vacant;
-wait_queue_head_t drbd_pp_wait;
-
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
static const struct block_device_operations drbd_ops = {
@@ -1611,6 +1601,7 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
struct drbd_peer_request *peer_req)
{
+ bool use_sendpage = !(peer_req->flags & EE_RELEASE_TO_MEMPOOL);
struct page *page = peer_req->pages;
unsigned len = peer_req->i.size;
int err;
@@ -1619,8 +1610,13 @@ static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
page_chain_for_each(page) {
unsigned l = min_t(unsigned, len, PAGE_SIZE);
- err = _drbd_send_page(peer_device, page, 0, l,
- page_chain_next(page) ? MSG_MORE : 0);
+ if (likely(use_sendpage))
+ err = _drbd_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+ else
+ err = _drbd_no_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+
if (err)
return err;
len -= l;
@@ -1962,7 +1958,6 @@ void drbd_init_set_defaults(struct drbd_device *device)
INIT_LIST_HEAD(&device->sync_ee);
INIT_LIST_HEAD(&device->done_ee);
INIT_LIST_HEAD(&device->read_ee);
- INIT_LIST_HEAD(&device->net_ee);
INIT_LIST_HEAD(&device->resync_reads);
INIT_LIST_HEAD(&device->resync_work.list);
INIT_LIST_HEAD(&device->unplug_work.list);
@@ -2043,7 +2038,6 @@ void drbd_device_cleanup(struct drbd_device *device)
D_ASSERT(device, list_empty(&device->sync_ee));
D_ASSERT(device, list_empty(&device->done_ee));
D_ASSERT(device, list_empty(&device->read_ee));
- D_ASSERT(device, list_empty(&device->net_ee));
D_ASSERT(device, list_empty(&device->resync_reads));
D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
D_ASSERT(device, list_empty(&device->resync_work.list));
@@ -2055,19 +2049,11 @@ void drbd_device_cleanup(struct drbd_device *device)
static void drbd_destroy_mempools(void)
{
- struct page *page;
-
- while (drbd_pp_pool) {
- page = drbd_pp_pool;
- drbd_pp_pool = (struct page *)page_private(page);
- __free_page(page);
- drbd_pp_vacant--;
- }
-
/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
bioset_exit(&drbd_io_bio_set);
bioset_exit(&drbd_md_io_bio_set);
+ mempool_exit(&drbd_buffer_page_pool);
mempool_exit(&drbd_md_io_page_pool);
mempool_exit(&drbd_ee_mempool);
mempool_exit(&drbd_request_mempool);
@@ -2086,9 +2072,8 @@ static void drbd_destroy_mempools(void)
static int drbd_create_mempools(void)
{
- struct page *page;
const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
- int i, ret;
+ int ret;
/* caches */
drbd_request_cache = kmem_cache_create(
@@ -2125,6 +2110,10 @@ static int drbd_create_mempools(void)
if (ret)
goto Enomem;
+ ret = mempool_init_page_pool(&drbd_buffer_page_pool, number, 0);
+ if (ret)
+ goto Enomem;
+
ret = mempool_init_slab_pool(&drbd_request_mempool, number,
drbd_request_cache);
if (ret)
@@ -2134,15 +2123,6 @@ static int drbd_create_mempools(void)
if (ret)
goto Enomem;
- for (i = 0; i < number; i++) {
- page = alloc_page(GFP_HIGHUSER);
- if (!page)
- goto Enomem;
- set_page_private(page, (unsigned long)drbd_pp_pool);
- drbd_pp_pool = page;
- }
- drbd_pp_vacant = number;
-
return 0;
Enomem:
@@ -2169,10 +2149,6 @@ static void drbd_release_all_peer_reqs(struct drbd_device *device)
rr = drbd_free_peer_reqs(device, &device->done_ee);
if (rr)
drbd_err(device, "%d EEs in done list found!\n", rr);
-
- rr = drbd_free_peer_reqs(device, &device->net_ee);
- if (rr)
- drbd_err(device, "%d EEs in net list found!\n", rr);
}
/* caution. no locking. */
@@ -2863,11 +2839,6 @@ static int __init drbd_init(void)
return err;
}
- /*
- * allocate all necessary structs
- */
- init_waitqueue_head(&drbd_pp_wait);
-
drbd_proc = NULL; /* play safe for drbd_cleanup */
idr_init(&drbd_devices);
@@ -3591,7 +3562,8 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
static void md_sync_timer_fn(struct timer_list *t)
{
- struct drbd_device *device = from_timer(device, t, md_sync_timer);
+ struct drbd_device *device = timer_container_of(device, t,
+ md_sync_timer);
drbd_device_post_work(device, MD_SYNC);
}
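
The drbd conversion above retires the hand-rolled drbd_pp_pool page chain in
favor of a standard page-backed mempool, which already provides a
pre-allocated reserve with fallback to the page allocator. The core API in
isolation, as a sketch (the pool size and GFP flags are illustrative, not
drbd's exact values):

	#include <linux/gfp.h>
	#include <linux/mempool.h>

	static mempool_t pool;

	static int pool_setup(void)
	{
		/* reserve 128 order-0 pages for use under memory pressure */
		return mempool_init_page_pool(&pool, 128, 0);
	}

	static void pool_example(void)
	{
		struct page *page = mempool_alloc(&pool, GFP_NOIO);

		if (page)
			mempool_free(page, &pool);	/* refills the reserve first */
	}

	static void pool_teardown(void)
	{
		mempool_exit(&pool);
	}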
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index e5a2e5f7887b..caaf2781136d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -33,6 +33,7 @@
#include <linux/string.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
+#include <linux/mempool.h>
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"
@@ -63,182 +64,31 @@ static int e_end_block(struct drbd_work *, int);
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
-/*
- * some helper functions to deal with single linked page lists,
- * page->private being our "next" pointer.
- */
-
-/* If at least n pages are linked at head, get n pages off.
- * Otherwise, don't modify head, and return NULL.
- * Locking is the responsibility of the caller.
- */
-static struct page *page_chain_del(struct page **head, int n)
-{
- struct page *page;
- struct page *tmp;
-
- BUG_ON(!n);
- BUG_ON(!head);
-
- page = *head;
-
- if (!page)
- return NULL;
-
- while (page) {
- tmp = page_chain_next(page);
- if (--n == 0)
- break; /* found sufficient pages */
- if (tmp == NULL)
- /* insufficient pages, don't use any of them. */
- return NULL;
- page = tmp;
- }
-
- /* add end of list marker for the returned list */
- set_page_private(page, 0);
- /* actual return value, and adjustment of head */
- page = *head;
- *head = tmp;
- return page;
-}
-
-/* may be used outside of locks to find the tail of a (usually short)
- * "private" page chain, before adding it back to a global chain head
- * with page_chain_add() under a spinlock. */
-static struct page *page_chain_tail(struct page *page, int *len)
-{
- struct page *tmp;
- int i = 1;
- while ((tmp = page_chain_next(page))) {
- ++i;
- page = tmp;
- }
- if (len)
- *len = i;
- return page;
-}
-
-static int page_chain_free(struct page *page)
-{
- struct page *tmp;
- int i = 0;
- page_chain_for_each_safe(page, tmp) {
- put_page(page);
- ++i;
- }
- return i;
-}
-
-static void page_chain_add(struct page **head,
- struct page *chain_first, struct page *chain_last)
-{
-#if 1
- struct page *tmp;
- tmp = page_chain_tail(chain_first, NULL);
- BUG_ON(tmp != chain_last);
-#endif
-
- /* add chain to head */
- set_page_private(chain_last, (unsigned long)*head);
- *head = chain_first;
-}
-
-static struct page *__drbd_alloc_pages(struct drbd_device *device,
- unsigned int number)
+static struct page *__drbd_alloc_pages(unsigned int number)
{
struct page *page = NULL;
struct page *tmp = NULL;
unsigned int i = 0;
- /* Yes, testing drbd_pp_vacant outside the lock is racy.
- * So what. It saves a spin_lock. */
- if (drbd_pp_vacant >= number) {
- spin_lock(&drbd_pp_lock);
- page = page_chain_del(&drbd_pp_pool, number);
- if (page)
- drbd_pp_vacant -= number;
- spin_unlock(&drbd_pp_lock);
- if (page)
- return page;
- }
-
/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
for (i = 0; i < number; i++) {
- tmp = alloc_page(GFP_TRY);
+ tmp = mempool_alloc(&drbd_buffer_page_pool, GFP_TRY);
if (!tmp)
- break;
+ goto fail;
set_page_private(tmp, (unsigned long)page);
page = tmp;
}
-
- if (i == number)
- return page;
-
- /* Not enough pages immediately available this time.
- * No need to jump around here, drbd_alloc_pages will retry this
- * function "soon". */
- if (page) {
- tmp = page_chain_tail(page, NULL);
- spin_lock(&drbd_pp_lock);
- page_chain_add(&drbd_pp_pool, page, tmp);
- drbd_pp_vacant += i;
- spin_unlock(&drbd_pp_lock);
+ return page;
+fail:
+ page_chain_for_each_safe(page, tmp) {
+ set_page_private(page, 0);
+ mempool_free(page, &drbd_buffer_page_pool);
}
return NULL;
}
-static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
- struct list_head *to_be_freed)
-{
- struct drbd_peer_request *peer_req, *tmp;
-
- /* The EEs are always appended to the end of the list. Since
- they are sent in order over the wire, they have to finish
- in order. As soon as we see the first not finished we can
- stop to examine the list... */
-
- list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
- if (drbd_peer_req_has_active_page(peer_req))
- break;
- list_move(&peer_req->w.list, to_be_freed);
- }
-}
-
-static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
-{
- LIST_HEAD(reclaimed);
- struct drbd_peer_request *peer_req, *t;
-
- spin_lock_irq(&device->resource->req_lock);
- reclaim_finished_net_peer_reqs(device, &reclaimed);
- spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
- drbd_free_net_peer_req(device, peer_req);
-}
-
-static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
-{
- struct drbd_peer_device *peer_device;
- int vnr;
-
- rcu_read_lock();
- idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
- struct drbd_device *device = peer_device->device;
- if (!atomic_read(&device->pp_in_use_by_net))
- continue;
-
- kref_get(&device->kref);
- rcu_read_unlock();
- drbd_reclaim_net_peer_reqs(device);
- kref_put(&device->kref, drbd_destroy_device);
- rcu_read_lock();
- }
- rcu_read_unlock();
-}
-
/**
* drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
* @peer_device: DRBD device.
@@ -263,9 +113,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
bool retry)
{
struct drbd_device *device = peer_device->device;
- struct page *page = NULL;
+ struct page *page;
struct net_conf *nc;
- DEFINE_WAIT(wait);
unsigned int mxb;
rcu_read_lock();
@@ -273,37 +122,9 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
mxb = nc ? nc->max_buffers : 1000000;
rcu_read_unlock();
- if (atomic_read(&device->pp_in_use) < mxb)
- page = __drbd_alloc_pages(device, number);
-
- /* Try to keep the fast path fast, but occasionally we need
- * to reclaim the pages we lended to the network stack. */
- if (page && atomic_read(&device->pp_in_use_by_net) > 512)
- drbd_reclaim_net_peer_reqs(device);
-
- while (page == NULL) {
- prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
-
- drbd_reclaim_net_peer_reqs(device);
-
- if (atomic_read(&device->pp_in_use) < mxb) {
- page = __drbd_alloc_pages(device, number);
- if (page)
- break;
- }
-
- if (!retry)
- break;
-
- if (signal_pending(current)) {
- drbd_warn(device, "drbd_alloc_pages interrupted!\n");
- break;
- }
-
- if (schedule_timeout(HZ/10) == 0)
- mxb = UINT_MAX;
- }
- finish_wait(&drbd_pp_wait, &wait);
+ if (atomic_read(&device->pp_in_use) >= mxb)
+ schedule_timeout_interruptible(HZ / 10);
+ page = __drbd_alloc_pages(number);
if (page)
atomic_add(number, &device->pp_in_use);
@@ -314,29 +135,25 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
* Is also used from inside an other spin_lock_irq(&resource->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
+static void drbd_free_pages(struct drbd_device *device, struct page *page)
{
- atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
- int i;
+ struct page *tmp;
+ int i = 0;
if (page == NULL)
return;
- if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
- i = page_chain_free(page);
- else {
- struct page *tmp;
- tmp = page_chain_tail(page, &i);
- spin_lock(&drbd_pp_lock);
- page_chain_add(&drbd_pp_pool, page, tmp);
- drbd_pp_vacant += i;
- spin_unlock(&drbd_pp_lock);
- }
- i = atomic_sub_return(i, a);
+ page_chain_for_each_safe(page, tmp) {
+ set_page_private(page, 0);
+ if (page_count(page) == 1)
+ mempool_free(page, &drbd_buffer_page_pool);
+ else
+ put_page(page);
+ i++;
+ }
+ i = atomic_sub_return(i, &device->pp_in_use);
if (i < 0)
- drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
- is_net ? "pp_in_use_by_net" : "pp_in_use", i);
- wake_up(&drbd_pp_wait);
+ drbd_warn(device, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
}
/*
@@ -380,6 +197,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
gfpflags_allow_blocking(gfp_mask));
if (!page)
goto fail;
+ if (!mempool_is_saturated(&drbd_buffer_page_pool))
+ peer_req->flags |= EE_RELEASE_TO_MEMPOOL;
}
memset(peer_req, 0, sizeof(*peer_req));
@@ -403,13 +222,12 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return NULL;
}
-void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
- int is_net)
+void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
might_sleep();
if (peer_req->flags & EE_HAS_DIGEST)
kfree(peer_req->digest);
- drbd_free_pages(device, peer_req->pages, is_net);
+ drbd_free_pages(device, peer_req->pages);
D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
D_ASSERT(device, drbd_interval_empty(&peer_req->i));
if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
@@ -424,14 +242,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
LIST_HEAD(work_list);
struct drbd_peer_request *peer_req, *t;
int count = 0;
- int is_net = list == &device->net_ee;
spin_lock_irq(&device->resource->req_lock);
list_splice_init(list, &work_list);
spin_unlock_irq(&device->resource->req_lock);
list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
- __drbd_free_peer_req(device, peer_req, is_net);
+ drbd_free_peer_req(device, peer_req);
count++;
}
return count;
@@ -443,18 +260,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
static int drbd_finish_peer_reqs(struct drbd_device *device)
{
LIST_HEAD(work_list);
- LIST_HEAD(reclaimed);
struct drbd_peer_request *peer_req, *t;
int err = 0;
spin_lock_irq(&device->resource->req_lock);
- reclaim_finished_net_peer_reqs(device, &reclaimed);
list_splice_init(&device->done_ee, &work_list);
spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
- drbd_free_net_peer_req(device, peer_req);
-
/* possible callbacks here:
* e_end_block, and e_end_resync_block, e_send_superseded.
* all ignore the last argument.
@@ -1975,7 +1787,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
data_size -= len;
}
kunmap(page);
- drbd_free_pages(peer_device->device, page, 0);
+ drbd_free_pages(peer_device->device, page);
return err;
}
@@ -2500,7 +2312,11 @@ static int handle_write_conflicts(struct drbd_device *device,
peer_req->w.cb = superseded ? e_send_superseded :
e_send_retry_write;
list_add_tail(&peer_req->w.list, &device->done_ee);
- queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
+ /* put is in drbd_send_acks_wf() */
+ kref_get(&device->kref);
+ if (!queue_work(connection->ack_sender,
+ &peer_req->peer_device->send_acks_work))
+ kref_put(&device->kref, drbd_destroy_device);
err = -ENOENT;
goto out;
@@ -5220,16 +5036,6 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
put_ldev(device);
}
- /* tcp_close and release of sendpage pages can be deferred. I don't
- * want to use SO_LINGER, because apparently it can be deferred for
- * more than 20 seconds (longest time I checked).
- *
- * Actually we don't care for exactly when the network stack does its
- * put_page(), but release our reference on these pages right here.
- */
- i = drbd_free_peer_reqs(device, &device->net_ee);
- if (i)
- drbd_info(device, "net_ee not empty, killed %u entries\n", i);
i = atomic_read(&device->pp_in_use_by_net);
if (i)
drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
@@ -5976,8 +5782,6 @@ int drbd_ack_receiver(struct drbd_thread *thi)
while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi);
- conn_reclaim_net_peer_reqs(connection);
-
if (test_and_clear_bit(SEND_PING, &connection->flags)) {
if (drbd_send_ping(connection)) {
drbd_err(connection, "drbd_send_ping has failed\n");
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 380e6584a4ee..d15826f6ee81 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1699,7 +1699,8 @@ static bool net_timeout_reached(struct drbd_request *net_req,
void request_timer_fn(struct timer_list *t)
{
- struct drbd_device *device = from_timer(device, t, request_timer);
+ struct drbd_device *device = timer_container_of(device, t,
+ request_timer);
struct drbd_connection *connection = first_peer_device(device)->connection;
struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
struct net_conf *nc;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 4352a50fbb3f..dea3e79d044f 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -442,7 +442,8 @@ int w_resync_timer(struct drbd_work *w, int cancel)
void resync_timer_fn(struct timer_list *t)
{
- struct drbd_device *device = from_timer(device, t, resync_timer);
+ struct drbd_device *device = timer_container_of(device, t,
+ resync_timer);
drbd_queue_work_if_unqueued(
&first_peer_device(device)->connection->sender_work,
@@ -1029,22 +1030,6 @@ out:
return 1;
}
-/* helper */
-static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
-{
- if (drbd_peer_req_has_active_page(peer_req)) {
- /* This might happen if sendpage() has not finished */
- int i = PFN_UP(peer_req->i.size);
- atomic_add(i, &device->pp_in_use_by_net);
- atomic_sub(i, &device->pp_in_use);
- spin_lock_irq(&device->resource->req_lock);
- list_add_tail(&peer_req->w.list, &device->net_ee);
- spin_unlock_irq(&device->resource->req_lock);
- wake_up(&drbd_pp_wait);
- } else
- drbd_free_peer_req(device, peer_req);
-}
-
/**
* w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
* @w: work object.
@@ -1058,9 +1043,8 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
int err;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
@@ -1073,12 +1057,12 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
}
- dec_unacked(device);
-
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
@@ -1119,9 +1103,8 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
int err;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (get_ldev_if_state(device, D_FAILED)) {
@@ -1154,13 +1137,12 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
/* update resync data with failure */
drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size);
}
-
- dec_unacked(device);
-
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
@@ -1175,9 +1157,8 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
int err, eq = 0;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (get_ldev(device)) {
@@ -1219,12 +1200,12 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
if (drbd_ratelimit())
drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
}
-
- dec_unacked(device);
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block/ack() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
@@ -1698,7 +1679,8 @@ void drbd_rs_controller_reset(struct drbd_peer_device *peer_device)
void start_resync_timer_fn(struct timer_list *t)
{
- struct drbd_device *device = from_timer(device, t, start_resync_timer);
+ struct drbd_device *device = timer_container_of(device, t,
+ start_resync_timer);
drbd_device_post_work(device, RS_START);
}
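
The from_timer() to timer_container_of() conversions scattered through this
series (aoe, drbd) are mechanical: the macro recovers the containing
structure from the struct timer_list pointer passed to the callback, and only
the name changed. A minimal sketch with hypothetical names:

	#include <linux/timer.h>

	struct mydata {
		struct timer_list	timer;
		int			ticks;
	};

	static void mydata_timer_fn(struct timer_list *t)
	{
		/* identical semantics to the old from_timer(d, t, timer) */
		struct mydata *d = timer_container_of(d, t, timer);

		d->ticks++;
	}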
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index e97432032f01..24be0c2c4075 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3411,7 +3411,7 @@ static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode,
struct floppy_max_errors max_errors;
struct floppy_drive_params dp;
} inparam; /* parameters coming from user space */
- const void *outparam; /* parameters passed back to user space */
+ const void *outparam = NULL; /* parameters passed back to user space */
/* convert compatibility eject ioctls into floppy eject ioctl.
* We do this in order to provide a means to eject floppy disks before
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index e2b1f377f585..1b6ee91f8eb9 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -313,6 +313,8 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
return;
kfree(cmd->bvec);
cmd->bvec = NULL;
+ if (req_op(rq) == REQ_OP_WRITE)
+ kiocb_end_write(&cmd->iocb);
if (likely(!blk_should_fake_timeout(rq->q)))
blk_mq_complete_request(rq);
}
@@ -387,9 +389,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
cmd->iocb.ki_flags = 0;
}
- if (rw == ITER_SOURCE)
+ if (rw == ITER_SOURCE) {
+ kiocb_start_write(&cmd->iocb);
ret = file->f_op->write_iter(&cmd->iocb, &iter);
- else
+ } else
ret = file->f_op->read_iter(&cmd->iocb, &iter);
lo_rw_aio_do_completion(cmd);
@@ -1244,12 +1247,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS;
lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS);
- if (size_changed) {
- loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
- lo->lo_backing_file);
- loop_set_size(lo, new_size);
- }
-
/* update the direct I/O flag if lo_offset changed */
loop_update_dio(lo);
@@ -1257,6 +1254,11 @@ out_unfreeze:
blk_mq_unfreeze_queue(lo->lo_queue, memflags);
if (partscan)
clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
+ if (!err && size_changed) {
+ loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
+ lo->lo_backing_file);
+ loop_set_size(lo, new_size);
+ }
out_unlock:
mutex_unlock(&lo->lo_mutex);
if (partscan)
@@ -1429,17 +1431,34 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
return 0;
}
-static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
+static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode,
+ struct block_device *bdev, unsigned long arg)
{
struct queue_limits lim;
unsigned int memflags;
int err = 0;
- if (lo->lo_state != Lo_bound)
- return -ENXIO;
+ /*
+ * If we don't hold an exclusive handle for the device, upgrade to one
+ * here to avoid changing the device under an exclusive owner.
+ */
+ if (!(mode & BLK_OPEN_EXCL)) {
+ err = bd_prepare_to_claim(bdev, loop_set_block_size, NULL);
+ if (err)
+ return err;
+ }
+
+ err = mutex_lock_killable(&lo->lo_mutex);
+ if (err)
+ goto abort_claim;
+
+ if (lo->lo_state != Lo_bound) {
+ err = -ENXIO;
+ goto unlock;
+ }
if (lo->lo_queue->limits.logical_block_size == arg)
- return 0;
+ goto unlock;
sync_blockdev(lo->lo_device);
invalidate_bdev(lo->lo_device);
@@ -1452,6 +1471,11 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
loop_update_dio(lo);
blk_mq_unfreeze_queue(lo->lo_queue, memflags);
+unlock:
+ mutex_unlock(&lo->lo_mutex);
+abort_claim:
+ if (!(mode & BLK_OPEN_EXCL))
+ bd_abort_claiming(bdev, loop_set_block_size);
return err;
}
@@ -1470,9 +1494,6 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
case LOOP_SET_DIRECT_IO:
err = loop_set_dio(lo, arg);
break;
- case LOOP_SET_BLOCK_SIZE:
- err = loop_set_block_size(lo, arg);
- break;
default:
err = -EINVAL;
}
@@ -1527,9 +1548,12 @@ static int lo_ioctl(struct block_device *bdev, blk_mode_t mode,
break;
case LOOP_GET_STATUS64:
return loop_get_status64(lo, argp);
+ case LOOP_SET_BLOCK_SIZE:
+ if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return loop_set_block_size(lo, mode, bdev, arg);
case LOOP_SET_CAPACITY:
case LOOP_SET_DIRECT_IO:
- case LOOP_SET_BLOCK_SIZE:
if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN))
return -EPERM;
fallthrough;
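
The loop change above makes LOOP_SET_BLOCK_SIZE briefly claim the device when
the caller did not open it exclusively, so the block size cannot change
underneath an exclusive owner; the claim is aborted, never completed, once
the change is done. The claim/abort pairing in isolation (a sketch; the
holder token and the reconfiguration step are hypothetical):

	#include <linux/blkdev.h>

	static int change_under_claim(struct block_device *bdev, void *holder)
	{
		int err;

		/* fails if someone else already holds an exclusive claim */
		err = bd_prepare_to_claim(bdev, holder, NULL);
		if (err)
			return err;

		/* ... reconfigure the device here ... */

		/* the claim is only ever aborted, never finished */
		bd_abort_claiming(bdev, holder);
		return 0;
	}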
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 0d619df03fa9..8fc7761397bd 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2040,11 +2040,12 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
* @dir Direction (read or write)
*
* return value
- * None
+ * 0 The IO completed successfully.
+ * -ENOMEM The DMA mapping failed.
*/
-static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
- struct mtip_cmd *command,
- struct blk_mq_hw_ctx *hctx)
+static int mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
+ struct mtip_cmd *command,
+ struct blk_mq_hw_ctx *hctx)
{
struct mtip_cmd_hdr *hdr =
dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
@@ -2056,12 +2057,14 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
unsigned int nents;
/* Map the scatter list for DMA access */
- nents = blk_rq_map_sg(rq, command->sg);
- nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
+ command->scatter_ents = blk_rq_map_sg(rq, command->sg);
+ nents = dma_map_sg(&dd->pdev->dev, command->sg,
+ command->scatter_ents, dma_dir);
+ if (!nents)
+ return -ENOMEM;
- prefetch(&port->flags);
- command->scatter_ents = nents;
+ prefetch(&port->flags);
/*
* The number of retries for this command before it is
@@ -2112,11 +2115,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) {
set_bit(rq->tag, port->cmds_to_issue);
set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
- return;
+ return 0;
}
/* Issue the command to the hardware */
mtip_issue_ncq_command(port, rq->tag);
+
+ return 0;
}
/*
@@ -3315,7 +3320,9 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(rq);
- mtip_hw_submit_io(dd, rq, cmd, hctx);
+ if (mtip_hw_submit_io(dd, rq, cmd, hctx))
+ return BLK_STS_IOERR;
+
return BLK_STS_OK;
}
@@ -3717,7 +3724,7 @@ static int mtip_pci_probe(struct pci_dev *pdev,
rv = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
if (rv) {
dev_warn(&pdev->dev, "64-bit DMA enable failed\n");
- goto setmask_err;
+ goto iomap_err;
}
/* Copy the info we may need later into the private data structure. */
@@ -3733,7 +3740,7 @@ static int mtip_pci_probe(struct pci_dev *pdev,
if (!dd->isr_workq) {
dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
rv = -ENOMEM;
- goto setmask_err;
+ goto iomap_err;
}
memset(cpu_list, 0, sizeof(cpu_list));
@@ -3830,8 +3837,6 @@ msi_initialize_err:
drop_cpu(dd->work[1].cpu_binding);
drop_cpu(dd->work[2].cpu_binding);
}
-setmask_err:
- pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
iomap_err:
kfree(dd);
@@ -3907,7 +3912,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
pci_disable_msi(pdev);
- pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
pci_set_drvdata(pdev, NULL);
put_disk(dd->disk);
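
The mtip32xx fix checks the dma_map_sg() return value, which is 0 when the
mapping fails outright; the old code used it unconditionally. The general
shape of the check, sketched:

	#include <linux/dma-mapping.h>
	#include <linux/errno.h>
	#include <linux/scatterlist.h>

	static int map_for_dma(struct device *dev, struct scatterlist *sg,
			       int nents, enum dma_data_direction dir)
	{
		int mapped = dma_map_sg(dev, sg, nents, dir);

		if (!mapped)	/* 0 means the mapping failed */
			return -ENOMEM;

		/* ... issue the I/O, then dma_unmap_sg(dev, sg, nents, dir) ... */
		return 0;
	}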
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7bdc7eb808ea..6463d0e8d0ce 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1473,7 +1473,17 @@ static int nbd_start_device(struct nbd_device *nbd)
return -EINVAL;
}
- blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
+retry:
+ mutex_unlock(&nbd->config_lock);
+ blk_mq_update_nr_hw_queues(&nbd->tag_set, num_connections);
+ mutex_lock(&nbd->config_lock);
+
+ /* if another code path updated nr_hw_queues, retry until it succeeds */
+ if (num_connections != config->num_connections) {
+ num_connections = config->num_connections;
+ goto retry;
+ }
+
nbd->pid = task_pid_nr(current);
nbd_parse_flags(nbd);
@@ -2198,9 +2208,7 @@ again:
goto out;
}
}
- ret = nbd_start_device(nbd);
- if (ret)
- goto out;
+
if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
GFP_KERNEL);
@@ -2216,6 +2224,8 @@ again:
goto out;
}
set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags);
+
+ ret = nbd_start_device(nbd);
out:
mutex_unlock(&nbd->config_lock);
if (!ret) {
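
The nbd change drops config_lock around blk_mq_update_nr_hw_queues(), which
may sleep while freezing queues, and then re-checks the connection count,
retrying if it moved while the lock was released. That drop-lock/recheck
loop, sketched with hypothetical names:

	#include <linux/mutex.h>

	/* caller holds *lock on entry and on return */
	static void apply_with_retry(struct mutex *lock, int *shared_count,
				     void (*apply)(int))
	{
		int seen = *shared_count;

	retry:
		mutex_unlock(lock);
		apply(seen);		/* may sleep; must not hold lock */
		mutex_lock(lock);

		if (seen != *shared_count) {	/* raced: redo with new value */
			seen = *shared_count;
			goto retry;
		}
	}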
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index aa163ae9b2aa..91642c9a3b29 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1179,7 +1179,7 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
memcpy_page(dest, off + count, t_page->page, offset,
temp);
else
- zero_user(dest, off + count, temp);
+ memzero_page(dest, off + count, temp);
count += temp;
sector += temp >> SECTOR_SHIFT;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
deleted file mode 100644
index 65b96c083b3c..000000000000
--- a/drivers/block/pktcdvd.c
+++ /dev/null
@@ -1,2916 +0,0 @@
-/*
- * Copyright (C) 2000 Jens Axboe <axboe@suse.de>
- * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com>
- * Copyright (C) 2006 Thomas Maier <balagi@justmail.de>
- *
- * May be copied or modified under the terms of the GNU General Public
- * License. See linux/COPYING for more information.
- *
- * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and
- * DVD-RAM devices.
- *
- * Theory of operation:
- *
- * At the lowest level, there is the standard driver for the CD/DVD device,
- * such as drivers/scsi/sr.c. This driver can handle read and write requests,
- * but it doesn't know anything about the special restrictions that apply to
- * packet writing. One restriction is that write requests must be aligned to
- * packet boundaries on the physical media, and the size of a write request
- * must be equal to the packet size. Another restriction is that a
- * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read
- * command, if the previous command was a write.
- *
- * The purpose of the packet writing driver is to hide these restrictions from
- * higher layers, such as file systems, and present a block device that can be
- * randomly read and written using 2kB-sized blocks.
- *
- * The lowest layer in the packet writing driver is the packet I/O scheduler.
- * Its data is defined by the struct packet_iosched and includes two bio
- * queues with pending read and write requests. These queues are processed
- * by the pkt_iosched_process_queue() function. The write requests in this
- * queue are already properly aligned and sized. This layer is responsible for
- * issuing the flush cache commands and scheduling the I/O in a good order.
- *
- * The next layer transforms unaligned write requests to aligned writes. This
- * transformation requires reading missing pieces of data from the underlying
- * block device, assembling the pieces to full packets and queuing them to the
- * packet I/O scheduler.
- *
- * At the top layer there is a custom ->submit_bio function that forwards
- * read requests directly to the iosched queue and puts write requests in the
- * unaligned write queue. A kernel thread performs the necessary read
- * gathering to convert the unaligned writes to aligned writes and then feeds
- * them to the packet I/O scheduler.
- *
- *************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/backing-dev.h>
-#include <linux/compat.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/freezer.h>
-#include <linux/kernel.h>
-#include <linux/kthread.h>
-#include <linux/miscdevice.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/nospec.h>
-#include <linux/pktcdvd.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-
-#include <scsi/scsi.h>
-#include <scsi/scsi_cmnd.h>
-#include <scsi/scsi_ioctl.h>
-
-#include <linux/unaligned.h>
-
-#define DRIVER_NAME "pktcdvd"
-
-#define MAX_SPEED 0xffff
-
-static DEFINE_MUTEX(pktcdvd_mutex);
-static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
-static struct proc_dir_entry *pkt_proc;
-static int pktdev_major;
-static int write_congestion_on = PKT_WRITE_CONGESTION_ON;
-static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
-static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */
-static mempool_t psd_pool;
-static struct bio_set pkt_bio_set;
-
-/* /sys/class/pktcdvd */
-static struct class class_pktcdvd;
-static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
-
-/* forward declaration */
-static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev);
-static int pkt_remove_dev(dev_t pkt_dev);
-
-static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd)
-{
- return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1);
-}
-
-/**********************************************************
- * sysfs interface for pktcdvd
- * by (C) 2006 Thomas Maier <balagi@justmail.de>
-
- /sys/class/pktcdvd/pktcdvd[0-7]/
- stat/reset
- stat/packets_started
- stat/packets_finished
- stat/kb_written
- stat/kb_read
- stat/kb_read_gather
- write_queue/size
- write_queue/congestion_off
- write_queue/congestion_on
- **********************************************************/
-
-static ssize_t packets_started_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started);
-}
-static DEVICE_ATTR_RO(packets_started);
-
-static ssize_t packets_finished_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended);
-}
-static DEVICE_ATTR_RO(packets_finished);
-
-static ssize_t kb_written_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1);
-}
-static DEVICE_ATTR_RO(kb_written);
-
-static ssize_t kb_read_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1);
-}
-static DEVICE_ATTR_RO(kb_read);
-
-static ssize_t kb_read_gather_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1);
-}
-static DEVICE_ATTR_RO(kb_read_gather);
-
-static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t len)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- if (len > 0) {
- pd->stats.pkt_started = 0;
- pd->stats.pkt_ended = 0;
- pd->stats.secs_w = 0;
- pd->stats.secs_rg = 0;
- pd->stats.secs_r = 0;
- }
- return len;
-}
-static DEVICE_ATTR_WO(reset);
-
-static struct attribute *pkt_stat_attrs[] = {
- &dev_attr_packets_finished.attr,
- &dev_attr_packets_started.attr,
- &dev_attr_kb_read.attr,
- &dev_attr_kb_written.attr,
- &dev_attr_kb_read_gather.attr,
- &dev_attr_reset.attr,
- NULL,
-};
-
-static const struct attribute_group pkt_stat_group = {
- .name = "stat",
- .attrs = pkt_stat_attrs,
-};
-
-static ssize_t size_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int n;
-
- spin_lock(&pd->lock);
- n = sysfs_emit(buf, "%d\n", pd->bio_queue_size);
- spin_unlock(&pd->lock);
- return n;
-}
-static DEVICE_ATTR_RO(size);
-
-static void init_write_congestion_marks(int* lo, int* hi)
-{
- if (*hi > 0) {
- *hi = max(*hi, 500);
- *hi = min(*hi, 1000000);
- if (*lo <= 0)
- *lo = *hi - 100;
- else {
- *lo = min(*lo, *hi - 100);
- *lo = max(*lo, 100);
- }
- } else {
- *hi = -1;
- *lo = -1;
- }
-}
-
-static ssize_t congestion_off_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int n;
-
- spin_lock(&pd->lock);
- n = sysfs_emit(buf, "%d\n", pd->write_congestion_off);
- spin_unlock(&pd->lock);
- return n;
-}
-
-static ssize_t congestion_off_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t len)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int val, ret;
-
- ret = kstrtoint(buf, 10, &val);
- if (ret)
- return ret;
-
- spin_lock(&pd->lock);
- pd->write_congestion_off = val;
- init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on);
- spin_unlock(&pd->lock);
- return len;
-}
-static DEVICE_ATTR_RW(congestion_off);
-
-static ssize_t congestion_on_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int n;
-
- spin_lock(&pd->lock);
- n = sysfs_emit(buf, "%d\n", pd->write_congestion_on);
- spin_unlock(&pd->lock);
- return n;
-}
-
-static ssize_t congestion_on_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t len)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int val, ret;
-
- ret = kstrtoint(buf, 10, &val);
- if (ret)
- return ret;
-
- spin_lock(&pd->lock);
- pd->write_congestion_on = val;
- init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on);
- spin_unlock(&pd->lock);
- return len;
-}
-static DEVICE_ATTR_RW(congestion_on);
-
-static struct attribute *pkt_wq_attrs[] = {
- &dev_attr_congestion_on.attr,
- &dev_attr_congestion_off.attr,
- &dev_attr_size.attr,
- NULL,
-};
-
-static const struct attribute_group pkt_wq_group = {
- .name = "write_queue",
- .attrs = pkt_wq_attrs,
-};
-
-static const struct attribute_group *pkt_groups[] = {
- &pkt_stat_group,
- &pkt_wq_group,
- NULL,
-};
-
-static void pkt_sysfs_dev_new(struct pktcdvd_device *pd)
-{
- if (class_is_registered(&class_pktcdvd)) {
- pd->dev = device_create_with_groups(&class_pktcdvd, NULL,
- MKDEV(0, 0), pd, pkt_groups,
- "%s", pd->disk->disk_name);
- if (IS_ERR(pd->dev))
- pd->dev = NULL;
- }
-}
-
-static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd)
-{
- if (class_is_registered(&class_pktcdvd))
- device_unregister(pd->dev);
-}
-
-
-/********************************************************************
-  /sys/class/pktcdvd/
-                     add            map block device
-                     remove         unmap packet dev
-                     device_map     show mappings
- *******************************************************************/
-
-static ssize_t device_map_show(const struct class *c, const struct class_attribute *attr,
- char *data)
-{
- int n = 0;
- int idx;
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- for (idx = 0; idx < MAX_WRITERS; idx++) {
- struct pktcdvd_device *pd = pkt_devs[idx];
- if (!pd)
- continue;
- n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n",
- pd->disk->disk_name,
- MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev),
- MAJOR(file_bdev(pd->bdev_file)->bd_dev),
- MINOR(file_bdev(pd->bdev_file)->bd_dev));
- }
- mutex_unlock(&ctl_mutex);
- return n;
-}
-static CLASS_ATTR_RO(device_map);
-
-static ssize_t add_store(const struct class *c, const struct class_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned int major, minor;
-
- if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
- /* pkt_setup_dev() expects caller to hold reference to self */
- if (!try_module_get(THIS_MODULE))
- return -ENODEV;
-
- pkt_setup_dev(MKDEV(major, minor), NULL);
-
- module_put(THIS_MODULE);
-
- return count;
- }
-
- return -EINVAL;
-}
-static CLASS_ATTR_WO(add);
-
-static ssize_t remove_store(const struct class *c, const struct class_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned int major, minor;
- if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
- pkt_remove_dev(MKDEV(major, minor));
- return count;
- }
- return -EINVAL;
-}
-static CLASS_ATTR_WO(remove);
-
-static struct attribute *class_pktcdvd_attrs[] = {
- &class_attr_add.attr,
- &class_attr_remove.attr,
- &class_attr_device_map.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(class_pktcdvd);
-
-static struct class class_pktcdvd = {
- .name = DRIVER_NAME,
- .class_groups = class_pktcdvd_groups,
-};
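/*
 * Illustrative use of the control files declared above (device numbers are
 * examples only; 11:0 is typically the first SCSI CD-ROM, /dev/sr0):
 *
 *   echo "11:0" > /sys/class/pktcdvd/add        map the drive to the next free pktcdvd device
 *   cat /sys/class/pktcdvd/device_map           list writer -> device mappings
 *   echo "11:0" > /sys/class/pktcdvd/remove     tear the mapping down again
 */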
-
-static int pkt_sysfs_init(void)
-{
- /*
- * create control files in sysfs
- * /sys/class/pktcdvd/...
- */
- return class_register(&class_pktcdvd);
-}
-
-static void pkt_sysfs_cleanup(void)
-{
- class_unregister(&class_pktcdvd);
-}
-
-/********************************************************************
- entries in debugfs
-
- /sys/kernel/debug/pktcdvd[0-7]/
- info
-
- *******************************************************************/
-
-static void pkt_count_states(struct pktcdvd_device *pd, int *states)
-{
- struct packet_data *pkt;
- int i;
-
- for (i = 0; i < PACKET_NUM_STATES; i++)
- states[i] = 0;
-
- spin_lock(&pd->cdrw.active_list_lock);
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- states[pkt->state]++;
- }
- spin_unlock(&pd->cdrw.active_list_lock);
-}
-
-static int pkt_seq_show(struct seq_file *m, void *p)
-{
- struct pktcdvd_device *pd = m->private;
- char *msg;
- int states[PACKET_NUM_STATES];
-
- seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name,
- file_bdev(pd->bdev_file));
-
- seq_printf(m, "\nSettings:\n");
- seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2);
-
- if (pd->settings.write_type == 0)
- msg = "Packet";
- else
- msg = "Unknown";
- seq_printf(m, "\twrite type:\t\t%s\n", msg);
-
- seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable");
- seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss);
-
- seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode);
-
- if (pd->settings.block_mode == PACKET_BLOCK_MODE1)
- msg = "Mode 1";
- else if (pd->settings.block_mode == PACKET_BLOCK_MODE2)
- msg = "Mode 2";
- else
- msg = "Unknown";
- seq_printf(m, "\tblock mode:\t\t%s\n", msg);
-
- seq_printf(m, "\nStatistics:\n");
- seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started);
- seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended);
- seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1);
- seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1);
- seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1);
-
- seq_printf(m, "\nMisc:\n");
- seq_printf(m, "\treference count:\t%d\n", pd->refcnt);
- seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags);
- seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed);
- seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed);
- seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset);
- seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset);
-
- seq_printf(m, "\nQueue state:\n");
- seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size);
- seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios));
- seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", pd->current_sector);
-
- pkt_count_states(pd, states);
- seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
- states[0], states[1], states[2], states[3], states[4], states[5]);
-
- seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n",
- pd->write_congestion_off,
- pd->write_congestion_on);
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(pkt_seq);
-
-static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
-{
- if (!pkt_debugfs_root)
- return;
- pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root);
-
- pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root,
- pd, &pkt_seq_fops);
-}
-
-static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
-{
- if (!pkt_debugfs_root)
- return;
- debugfs_remove(pd->dfs_f_info);
- debugfs_remove(pd->dfs_d_root);
- pd->dfs_f_info = NULL;
- pd->dfs_d_root = NULL;
-}
-
-static void pkt_debugfs_init(void)
-{
- pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL);
-}
-
-static void pkt_debugfs_cleanup(void)
-{
- debugfs_remove(pkt_debugfs_root);
- pkt_debugfs_root = NULL;
-}
-
-/* ----------------------------------------------------------*/
-
-
-static void pkt_bio_finished(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
-
- BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0);
- if (atomic_dec_and_test(&pd->cdrw.pending_bios)) {
- dev_dbg(ddev, "queue empty\n");
- atomic_set(&pd->iosched.attention, 1);
- wake_up(&pd->wqueue);
- }
-}
-
-/*
- * Allocate a packet_data struct
- */
-static struct packet_data *pkt_alloc_packet_data(int frames)
-{
- int i;
- struct packet_data *pkt;
-
- pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL);
- if (!pkt)
- goto no_pkt;
-
- pkt->frames = frames;
- pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL);
- if (!pkt->w_bio)
- goto no_bio;
-
- for (i = 0; i < frames / FRAMES_PER_PAGE; i++) {
- pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
- if (!pkt->pages[i])
- goto no_page;
- }
-
- spin_lock_init(&pkt->lock);
- bio_list_init(&pkt->orig_bios);
-
- for (i = 0; i < frames; i++) {
- pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL);
- if (!pkt->r_bios[i])
- goto no_rd_bio;
- }
-
- return pkt;
-
-no_rd_bio:
- for (i = 0; i < frames; i++)
- kfree(pkt->r_bios[i]);
-no_page:
- for (i = 0; i < frames / FRAMES_PER_PAGE; i++)
- if (pkt->pages[i])
- __free_page(pkt->pages[i]);
- kfree(pkt->w_bio);
-no_bio:
- kfree(pkt);
-no_pkt:
- return NULL;
-}
-
-/*
- * Free a packet_data struct
- */
-static void pkt_free_packet_data(struct packet_data *pkt)
-{
- int i;
-
- for (i = 0; i < pkt->frames; i++)
- kfree(pkt->r_bios[i]);
- for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++)
- __free_page(pkt->pages[i]);
- kfree(pkt->w_bio);
- kfree(pkt);
-}
-
-static void pkt_shrink_pktlist(struct pktcdvd_device *pd)
-{
- struct packet_data *pkt, *next;
-
- BUG_ON(!list_empty(&pd->cdrw.pkt_active_list));
-
- list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) {
- pkt_free_packet_data(pkt);
- }
- INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
-}
-
-static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets)
-{
- struct packet_data *pkt;
-
- BUG_ON(!list_empty(&pd->cdrw.pkt_free_list));
-
- while (nr_packets > 0) {
- pkt = pkt_alloc_packet_data(pd->settings.size >> 2);
- if (!pkt) {
- pkt_shrink_pktlist(pd);
- return 0;
- }
- pkt->id = nr_packets;
- pkt->pd = pd;
- list_add(&pkt->list, &pd->cdrw.pkt_free_list);
- nr_packets--;
- }
- return 1;
-}
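/*
 * Sizing sketch for the allocation above: pd->settings.size >> 2 converts the
 * packet size from 512-byte sectors to 2048-byte frames, so a 32KiB packet
 * (64 sectors) gives 16 frames per packet_data - 16 read bios, one write bio
 * and, assuming 4KiB pages (FRAMES_PER_PAGE == 2), 8 data pages.
 */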
-
-static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
-{
- struct rb_node *n = rb_next(&node->rb_node);
- if (!n)
- return NULL;
- return rb_entry(n, struct pkt_rb_node, rb_node);
-}
-
-static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
-{
- rb_erase(&node->rb_node, &pd->bio_queue);
- mempool_free(node, &pd->rb_pool);
- pd->bio_queue_size--;
- BUG_ON(pd->bio_queue_size < 0);
-}
-
-/*
- * Find the first node in the pd->bio_queue rb tree with a starting sector >= s.
- */
-static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s)
-{
- struct rb_node *n = pd->bio_queue.rb_node;
- struct rb_node *next;
- struct pkt_rb_node *tmp;
-
- if (!n) {
- BUG_ON(pd->bio_queue_size > 0);
- return NULL;
- }
-
- for (;;) {
- tmp = rb_entry(n, struct pkt_rb_node, rb_node);
- if (s <= tmp->bio->bi_iter.bi_sector)
- next = n->rb_left;
- else
- next = n->rb_right;
- if (!next)
- break;
- n = next;
- }
-
- if (s > tmp->bio->bi_iter.bi_sector) {
- tmp = pkt_rbtree_next(tmp);
- if (!tmp)
- return NULL;
- }
- BUG_ON(s > tmp->bio->bi_iter.bi_sector);
- return tmp;
-}
-
-/*
- * Insert a node into the pd->bio_queue rb tree.
- */
-static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node)
-{
- struct rb_node **p = &pd->bio_queue.rb_node;
- struct rb_node *parent = NULL;
- sector_t s = node->bio->bi_iter.bi_sector;
- struct pkt_rb_node *tmp;
-
- while (*p) {
- parent = *p;
- tmp = rb_entry(parent, struct pkt_rb_node, rb_node);
- if (s < tmp->bio->bi_iter.bi_sector)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
- rb_link_node(&node->rb_node, parent, p);
- rb_insert_color(&node->rb_node, &pd->bio_queue);
- pd->bio_queue_size++;
-}
-
-/*
- * Send a packet_command to the underlying block device and
- * wait for completion.
- */
-static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc)
-{
- struct request_queue *q = bdev_get_queue(file_bdev(pd->bdev_file));
- struct scsi_cmnd *scmd;
- struct request *rq;
- int ret = 0;
-
- rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
- REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
- scmd = blk_mq_rq_to_pdu(rq);
-
- if (cgc->buflen) {
- ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
- GFP_NOIO);
- if (ret)
- goto out;
- }
-
- scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]);
- memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE);
-
- rq->timeout = 60*HZ;
- if (cgc->quiet)
- rq->rq_flags |= RQF_QUIET;
-
- blk_execute_rq(rq, false);
- if (scmd->result)
- ret = -EIO;
-out:
- blk_mq_free_request(rq);
- return ret;
-}
-
-static const char *sense_key_string(__u8 index)
-{
- static const char * const info[] = {
- "No sense", "Recovered error", "Not ready",
- "Medium error", "Hardware error", "Illegal request",
- "Unit attention", "Data protect", "Blank check",
- };
-
- return index < ARRAY_SIZE(info) ? info[index] : "INVALID";
-}
-
-/*
- * A generic sense dump / resolve mechanism should be implemented across
- * all ATAPI + SCSI devices.
- */
-static void pkt_dump_sense(struct pktcdvd_device *pd,
- struct packet_command *cgc)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- struct scsi_sense_hdr *sshdr = cgc->sshdr;
-
- if (sshdr)
- dev_err(ddev, "%*ph - sense %02x.%02x.%02x (%s)\n",
- CDROM_PACKET_SIZE, cgc->cmd,
- sshdr->sense_key, sshdr->asc, sshdr->ascq,
- sense_key_string(sshdr->sense_key));
- else
- dev_err(ddev, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
-}
-
-/*
- * flush the drive cache to media
- */
-static int pkt_flush_cache(struct pktcdvd_device *pd)
-{
- struct packet_command cgc;
-
- init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.cmd[0] = GPCMD_FLUSH_CACHE;
- cgc.quiet = 1;
-
-	/*
-	 * the IMMED bit -- we default to not setting it; although that
-	 * would allow a much faster close, this is safer
-	 */
-#if 0
- cgc.cmd[1] = 1 << 1;
-#endif
- return pkt_generic_packet(pd, &cgc);
-}
-
-/*
- * speed is given as the normal factor, e.g. 4 for 4x
- */
-static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
- unsigned write_speed, unsigned read_speed)
-{
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- int ret;
-
- init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.sshdr = &sshdr;
- cgc.cmd[0] = GPCMD_SET_SPEED;
- put_unaligned_be16(read_speed, &cgc.cmd[2]);
- put_unaligned_be16(write_speed, &cgc.cmd[4]);
-
- ret = pkt_generic_packet(pd, &cgc);
- if (ret)
- pkt_dump_sense(pd, &cgc);
-
- return ret;
-}
-
-/*
- * Queue a bio for processing by the low-level CD device. Must be called
- * from process context.
- */
-static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
-{
- /*
-	 * Some CDRW drives cannot handle writes larger than one packet,
- * even if the size is a multiple of the packet size.
- */
- bio->bi_opf |= REQ_NOMERGE;
-
- spin_lock(&pd->iosched.lock);
- if (bio_data_dir(bio) == READ)
- bio_list_add(&pd->iosched.read_queue, bio);
- else
- bio_list_add(&pd->iosched.write_queue, bio);
- spin_unlock(&pd->iosched.lock);
-
- atomic_set(&pd->iosched.attention, 1);
- wake_up(&pd->wqueue);
-}
-
-/*
- * Process the queued read/write requests. This function handles special
- * requirements for CDRW drives:
- * - A cache flush command must be inserted before a read request if the
- * previous request was a write.
- * - Switching between reading and writing is slow, so don't do it more often
- * than necessary.
- * - Optimize for throughput at the expense of latency. This means that streaming
- * writes will never be interrupted by a read, but if the drive has to seek
- * before the next write, switch to reading instead if there are any pending
- * read requests.
- * - Set the read speed according to current usage pattern. When only reading
- * from the device, it's best to use the highest possible read speed, but
- * when switching often between reading and writing, it's better to have the
- * same read and write speeds.
- */
-static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
-
- if (atomic_read(&pd->iosched.attention) == 0)
- return;
- atomic_set(&pd->iosched.attention, 0);
-
- for (;;) {
- struct bio *bio;
- int reads_queued, writes_queued;
-
- spin_lock(&pd->iosched.lock);
- reads_queued = !bio_list_empty(&pd->iosched.read_queue);
- writes_queued = !bio_list_empty(&pd->iosched.write_queue);
- spin_unlock(&pd->iosched.lock);
-
- if (!reads_queued && !writes_queued)
- break;
-
- if (pd->iosched.writing) {
- int need_write_seek = 1;
- spin_lock(&pd->iosched.lock);
- bio = bio_list_peek(&pd->iosched.write_queue);
- spin_unlock(&pd->iosched.lock);
- if (bio && (bio->bi_iter.bi_sector ==
- pd->iosched.last_write))
- need_write_seek = 0;
- if (need_write_seek && reads_queued) {
- if (atomic_read(&pd->cdrw.pending_bios) > 0) {
- dev_dbg(ddev, "write, waiting\n");
- break;
- }
- pkt_flush_cache(pd);
- pd->iosched.writing = 0;
- }
- } else {
- if (!reads_queued && writes_queued) {
- if (atomic_read(&pd->cdrw.pending_bios) > 0) {
- dev_dbg(ddev, "read, waiting\n");
- break;
- }
- pd->iosched.writing = 1;
- }
- }
-
- spin_lock(&pd->iosched.lock);
- if (pd->iosched.writing)
- bio = bio_list_pop(&pd->iosched.write_queue);
- else
- bio = bio_list_pop(&pd->iosched.read_queue);
- spin_unlock(&pd->iosched.lock);
-
- if (!bio)
- continue;
-
- if (bio_data_dir(bio) == READ)
- pd->iosched.successive_reads +=
- bio->bi_iter.bi_size >> 10;
- else {
- pd->iosched.successive_reads = 0;
- pd->iosched.last_write = bio_end_sector(bio);
- }
- if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
- if (pd->read_speed == pd->write_speed) {
- pd->read_speed = MAX_SPEED;
- pkt_set_speed(pd, pd->write_speed, pd->read_speed);
- }
- } else {
- if (pd->read_speed != pd->write_speed) {
- pd->read_speed = pd->write_speed;
- pkt_set_speed(pd, pd->write_speed, pd->read_speed);
- }
- }
-
- atomic_inc(&pd->cdrw.pending_bios);
- submit_bio_noacct(bio);
- }
-}
-
-/*
- * Special care is needed if the underlying block device has a small
- * max_phys_segments value.
- */
-static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
-{
- struct device *ddev = disk_to_dev(pd->disk);
-
- if ((pd->settings.size << 9) / CD_FRAMESIZE <= queue_max_segments(q)) {
- /*
- * The cdrom device can handle one segment/frame
- */
- clear_bit(PACKET_MERGE_SEGS, &pd->flags);
- return 0;
- }
-
- if ((pd->settings.size << 9) / PAGE_SIZE <= queue_max_segments(q)) {
- /*
- * We can handle this case at the expense of some extra memory
- * copies during write operations
- */
- set_bit(PACKET_MERGE_SEGS, &pd->flags);
- return 0;
- }
-
- dev_err(ddev, "cdrom max_phys_segments too small\n");
- return -EIO;
-}
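/*
 * Worked example for the segment checks above, assuming a 32KiB packet
 * (pd->settings.size == 64 sectors): the packet write needs
 * (64 << 9) / CD_FRAMESIZE == 16 segments when every 2KiB frame is its own
 * segment, but only (64 << 9) / PAGE_SIZE == 8 segments on 4KiB pages when
 * the data is staged in the packet's own pages (the PACKET_MERGE_SEGS case,
 * at the cost of some extra copies).
 */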
-
-static void pkt_end_io_read(struct bio *bio)
-{
- struct packet_data *pkt = bio->bi_private;
- struct pktcdvd_device *pd = pkt->pd;
- BUG_ON(!pd);
-
- dev_dbg(disk_to_dev(pd->disk), "bio=%p sec0=%llx sec=%llx err=%d\n",
- bio, pkt->sector, bio->bi_iter.bi_sector, bio->bi_status);
-
- if (bio->bi_status)
- atomic_inc(&pkt->io_errors);
- bio_uninit(bio);
- if (atomic_dec_and_test(&pkt->io_wait)) {
- atomic_inc(&pkt->run_sm);
- wake_up(&pd->wqueue);
- }
- pkt_bio_finished(pd);
-}
-
-static void pkt_end_io_packet_write(struct bio *bio)
-{
- struct packet_data *pkt = bio->bi_private;
- struct pktcdvd_device *pd = pkt->pd;
- BUG_ON(!pd);
-
- dev_dbg(disk_to_dev(pd->disk), "id=%d, err=%d\n", pkt->id, bio->bi_status);
-
- pd->stats.pkt_ended++;
-
- bio_uninit(bio);
- pkt_bio_finished(pd);
- atomic_dec(&pkt->io_wait);
- atomic_inc(&pkt->run_sm);
- wake_up(&pd->wqueue);
-}
-
-/*
- * Schedule reads for the holes in a packet
- */
-static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- int frames_read = 0;
- struct bio *bio;
- int f;
- char written[PACKET_MAX_SIZE];
-
- BUG_ON(bio_list_empty(&pkt->orig_bios));
-
- atomic_set(&pkt->io_wait, 0);
- atomic_set(&pkt->io_errors, 0);
-
- /*
- * Figure out which frames we need to read before we can write.
- */
- memset(written, 0, sizeof(written));
- spin_lock(&pkt->lock);
- bio_list_for_each(bio, &pkt->orig_bios) {
- int first_frame = (bio->bi_iter.bi_sector - pkt->sector) /
- (CD_FRAMESIZE >> 9);
- int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE;
- pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9);
- BUG_ON(first_frame < 0);
- BUG_ON(first_frame + num_frames > pkt->frames);
- for (f = first_frame; f < first_frame + num_frames; f++)
- written[f] = 1;
- }
- spin_unlock(&pkt->lock);
-
- if (pkt->cache_valid) {
- dev_dbg(ddev, "zone %llx cached\n", pkt->sector);
- goto out_account;
- }
-
- /*
- * Schedule reads for missing parts of the packet.
- */
- for (f = 0; f < pkt->frames; f++) {
- int p, offset;
-
- if (written[f])
- continue;
-
- bio = pkt->r_bios[f];
- bio_init(bio, file_bdev(pd->bdev_file), bio->bi_inline_vecs, 1,
- REQ_OP_READ);
- bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
- bio->bi_end_io = pkt_end_io_read;
- bio->bi_private = pkt;
-
- p = (f * CD_FRAMESIZE) / PAGE_SIZE;
- offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
- dev_dbg(ddev, "Adding frame %d, page:%p offs:%d\n", f,
- pkt->pages[p], offset);
- if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset))
- BUG();
-
- atomic_inc(&pkt->io_wait);
- pkt_queue_bio(pd, bio);
- frames_read++;
- }
-
-out_account:
- dev_dbg(ddev, "need %d frames for zone %llx\n", frames_read, pkt->sector);
- pd->stats.pkt_started++;
- pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9);
-}
-
-/*
- * Find a packet matching zone, or the least recently used packet if
- * there is no match.
- */
-static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone)
-{
- struct packet_data *pkt;
-
- list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) {
- if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) {
- list_del_init(&pkt->list);
- if (pkt->sector != zone)
- pkt->cache_valid = 0;
- return pkt;
- }
- }
- BUG();
- return NULL;
-}
-
-static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
- if (pkt->cache_valid) {
- list_add(&pkt->list, &pd->cdrw.pkt_free_list);
- } else {
- list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list);
- }
-}
-
-static inline void pkt_set_state(struct device *ddev, struct packet_data *pkt,
- enum packet_data_state state)
-{
- static const char *state_name[] = {
- "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED"
- };
- enum packet_data_state old_state = pkt->state;
-
- dev_dbg(ddev, "pkt %2d : s=%6llx %s -> %s\n",
- pkt->id, pkt->sector, state_name[old_state], state_name[state]);
-
- pkt->state = state;
-}
-
-/*
- * Scan the work queue to see if we can start a new packet.
- * returns non-zero if any work was done.
- */
-static int pkt_handle_queue(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- struct packet_data *pkt, *p;
- struct bio *bio = NULL;
- sector_t zone = 0; /* Suppress gcc warning */
- struct pkt_rb_node *node, *first_node;
- struct rb_node *n;
-
- atomic_set(&pd->scan_queue, 0);
-
- if (list_empty(&pd->cdrw.pkt_free_list)) {
- dev_dbg(ddev, "no pkt\n");
- return 0;
- }
-
- /*
- * Try to find a zone we are not already working on.
- */
- spin_lock(&pd->lock);
- first_node = pkt_rbtree_find(pd, pd->current_sector);
- if (!first_node) {
- n = rb_first(&pd->bio_queue);
- if (n)
- first_node = rb_entry(n, struct pkt_rb_node, rb_node);
- }
- node = first_node;
- while (node) {
- bio = node->bio;
- zone = get_zone(bio->bi_iter.bi_sector, pd);
- list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) {
- if (p->sector == zone) {
- bio = NULL;
- goto try_next_bio;
- }
- }
- break;
-try_next_bio:
- node = pkt_rbtree_next(node);
- if (!node) {
- n = rb_first(&pd->bio_queue);
- if (n)
- node = rb_entry(n, struct pkt_rb_node, rb_node);
- }
- if (node == first_node)
- node = NULL;
- }
- spin_unlock(&pd->lock);
- if (!bio) {
- dev_dbg(ddev, "no bio\n");
- return 0;
- }
-
- pkt = pkt_get_packet_data(pd, zone);
-
- pd->current_sector = zone + pd->settings.size;
- pkt->sector = zone;
- BUG_ON(pkt->frames != pd->settings.size >> 2);
- pkt->write_size = 0;
-
- /*
- * Scan work queue for bios in the same zone and link them
- * to this packet.
- */
- spin_lock(&pd->lock);
- dev_dbg(ddev, "looking for zone %llx\n", zone);
- while ((node = pkt_rbtree_find(pd, zone)) != NULL) {
- sector_t tmp = get_zone(node->bio->bi_iter.bi_sector, pd);
-
- bio = node->bio;
- dev_dbg(ddev, "found zone=%llx\n", tmp);
- if (tmp != zone)
- break;
- pkt_rbtree_erase(pd, node);
- spin_lock(&pkt->lock);
- bio_list_add(&pkt->orig_bios, bio);
- pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE;
- spin_unlock(&pkt->lock);
- }
- /* check write congestion marks, and if bio_queue_size is
- * below, wake up any waiters
- */
- if (pd->congested &&
- pd->bio_queue_size <= pd->write_congestion_off) {
- pd->congested = false;
- wake_up_var(&pd->congested);
- }
- spin_unlock(&pd->lock);
-
- pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
- pkt_set_state(ddev, pkt, PACKET_WAITING_STATE);
- atomic_set(&pkt->run_sm, 1);
-
- spin_lock(&pd->cdrw.active_list_lock);
- list_add(&pkt->list, &pd->cdrw.pkt_active_list);
- spin_unlock(&pd->cdrw.active_list_lock);
-
- return 1;
-}
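/*
 * A minimal sketch of the zone rounding used above; get_zone() is defined
 * earlier in this driver and presumably also accounts for pd->offset on discs
 * whose first writable sector is not packet aligned. pkt_zone_sketch() below
 * is illustrative only:
 */
static inline sector_t pkt_zone_sketch(sector_t sector, sector_t pkt_sectors)
{
	/* e.g. sector 100 -> zone 64 when packets are 64 sectors (32KiB) */
	return sector & ~(pkt_sectors - 1);
}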
-
-/**
- * bio_list_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @dst: destination bio list
- * @src: source bio list
- *
- * Stops when it reaches the end of either the @src list or the @dst list -
- * that is, it copies min(src->bi_size, dst->bi_size) bytes (or the equivalent
- * for lists of bios).
- */
-static void bio_list_copy_data(struct bio *dst, struct bio *src)
-{
- struct bvec_iter src_iter = src->bi_iter;
- struct bvec_iter dst_iter = dst->bi_iter;
-
- while (1) {
- if (!src_iter.bi_size) {
- src = src->bi_next;
- if (!src)
- break;
-
- src_iter = src->bi_iter;
- }
-
- if (!dst_iter.bi_size) {
- dst = dst->bi_next;
- if (!dst)
- break;
-
- dst_iter = dst->bi_iter;
- }
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
-}
-
-/*
- * Assemble a bio to write one packet and queue the bio for processing
- * by the underlying block device.
- */
-static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- int f;
-
- bio_init(pkt->w_bio, file_bdev(pd->bdev_file), pkt->w_bio->bi_inline_vecs,
- pkt->frames, REQ_OP_WRITE);
- pkt->w_bio->bi_iter.bi_sector = pkt->sector;
- pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
- pkt->w_bio->bi_private = pkt;
-
- /* XXX: locking? */
- for (f = 0; f < pkt->frames; f++) {
- struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
- unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
-
- if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset))
- BUG();
- }
- dev_dbg(ddev, "vcnt=%d\n", pkt->w_bio->bi_vcnt);
-
- /*
- * Fill-in bvec with data from orig_bios.
- */
- spin_lock(&pkt->lock);
- bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);
-
- pkt_set_state(ddev, pkt, PACKET_WRITE_WAIT_STATE);
- spin_unlock(&pkt->lock);
-
- dev_dbg(ddev, "Writing %d frames for zone %llx\n", pkt->write_size, pkt->sector);
-
- if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames))
- pkt->cache_valid = 1;
- else
- pkt->cache_valid = 0;
-
- /* Start the write request */
- atomic_set(&pkt->io_wait, 1);
- pkt_queue_bio(pd, pkt->w_bio);
-}
-
-static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status)
-{
- struct bio *bio;
-
- if (status)
- pkt->cache_valid = 0;
-
- /* Finish all bios corresponding to this packet */
- while ((bio = bio_list_pop(&pkt->orig_bios))) {
- bio->bi_status = status;
- bio_endio(bio);
- }
-}
-
-static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
- struct device *ddev = disk_to_dev(pd->disk);
-
- dev_dbg(ddev, "pkt %d\n", pkt->id);
-
- for (;;) {
- switch (pkt->state) {
- case PACKET_WAITING_STATE:
- if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0))
- return;
-
- pkt->sleep_time = 0;
- pkt_gather_data(pd, pkt);
- pkt_set_state(ddev, pkt, PACKET_READ_WAIT_STATE);
- break;
-
- case PACKET_READ_WAIT_STATE:
- if (atomic_read(&pkt->io_wait) > 0)
- return;
-
- if (atomic_read(&pkt->io_errors) > 0) {
- pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE);
- } else {
- pkt_start_write(pd, pkt);
- }
- break;
-
- case PACKET_WRITE_WAIT_STATE:
- if (atomic_read(&pkt->io_wait) > 0)
- return;
-
- if (!pkt->w_bio->bi_status) {
- pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE);
- } else {
- pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE);
- }
- break;
-
- case PACKET_RECOVERY_STATE:
- dev_dbg(ddev, "No recovery possible\n");
- pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE);
- break;
-
- case PACKET_FINISHED_STATE:
- pkt_finish_packet(pkt, pkt->w_bio->bi_status);
- return;
-
- default:
- BUG();
- break;
- }
- }
-}
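/*
 * Net effect of the state machine above for one packet:
 *
 *   IDLE -> WAITING          picked up in pkt_handle_queue()
 *   WAITING -> READ_WAIT     sleep expired or packet full; missing frames are
 *                            read in pkt_gather_data()
 *   READ_WAIT -> WRITE_WAIT  all reads done, pkt_start_write() issues the
 *                            packet write
 *   READ_WAIT / WRITE_WAIT -> RECOVERY  on I/O error (no recovery is
 *                            implemented, it falls through to FINISHED)
 *   -> FINISHED              original bios are completed in pkt_finish_packet()
 */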
-
-static void pkt_handle_packets(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- struct packet_data *pkt, *next;
-
- /*
- * Run state machine for active packets
- */
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (atomic_read(&pkt->run_sm) > 0) {
- atomic_set(&pkt->run_sm, 0);
- pkt_run_state_machine(pd, pkt);
- }
- }
-
- /*
- * Move no longer active packets to the free list
- */
- spin_lock(&pd->cdrw.active_list_lock);
- list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) {
- if (pkt->state == PACKET_FINISHED_STATE) {
- list_del(&pkt->list);
- pkt_put_packet_data(pd, pkt);
- pkt_set_state(ddev, pkt, PACKET_IDLE_STATE);
- atomic_set(&pd->scan_queue, 1);
- }
- }
- spin_unlock(&pd->cdrw.active_list_lock);
-}
-
-/*
- * kcdrwd is woken up when writes have been queued for one of our
- * registered devices
- */
-static int kcdrwd(void *foobar)
-{
- struct pktcdvd_device *pd = foobar;
- struct device *ddev = disk_to_dev(pd->disk);
- struct packet_data *pkt;
- int states[PACKET_NUM_STATES];
- long min_sleep_time, residue;
-
- set_user_nice(current, MIN_NICE);
- set_freezable();
-
- for (;;) {
- DECLARE_WAITQUEUE(wait, current);
-
- /*
- * Wait until there is something to do
- */
- add_wait_queue(&pd->wqueue, &wait);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- /* Check if we need to run pkt_handle_queue */
- if (atomic_read(&pd->scan_queue) > 0)
- goto work_to_do;
-
- /* Check if we need to run the state machine for some packet */
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (atomic_read(&pkt->run_sm) > 0)
- goto work_to_do;
- }
-
- /* Check if we need to process the iosched queues */
- if (atomic_read(&pd->iosched.attention) != 0)
- goto work_to_do;
-
- /* Otherwise, go to sleep */
- pkt_count_states(pd, states);
- dev_dbg(ddev, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
- states[0], states[1], states[2], states[3], states[4], states[5]);
-
- min_sleep_time = MAX_SCHEDULE_TIMEOUT;
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (pkt->sleep_time && pkt->sleep_time < min_sleep_time)
- min_sleep_time = pkt->sleep_time;
- }
-
- dev_dbg(ddev, "sleeping\n");
- residue = schedule_timeout(min_sleep_time);
- dev_dbg(ddev, "wake up\n");
-
- /* make swsusp happy with our thread */
- try_to_freeze();
-
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (!pkt->sleep_time)
- continue;
- pkt->sleep_time -= min_sleep_time - residue;
- if (pkt->sleep_time <= 0) {
- pkt->sleep_time = 0;
- atomic_inc(&pkt->run_sm);
- }
- }
-
- if (kthread_should_stop())
- break;
- }
-work_to_do:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&pd->wqueue, &wait);
-
- if (kthread_should_stop())
- break;
-
- /*
- * if pkt_handle_queue returns true, we can queue
- * another request.
- */
- while (pkt_handle_queue(pd))
- ;
-
- /*
- * Handle packet state machine
- */
- pkt_handle_packets(pd);
-
- /*
- * Handle iosched queues
- */
- pkt_iosched_process_queue(pd);
- }
-
- return 0;
-}
-
-static void pkt_print_settings(struct pktcdvd_device *pd)
-{
- dev_info(disk_to_dev(pd->disk), "%s packets, %u blocks, Mode-%c disc\n",
- pd->settings.fp ? "Fixed" : "Variable",
- pd->settings.size >> 2,
- pd->settings.block_mode == 8 ? '1' : '2');
-}
-
-static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control)
-{
- memset(cgc->cmd, 0, sizeof(cgc->cmd));
-
- cgc->cmd[0] = GPCMD_MODE_SENSE_10;
- cgc->cmd[2] = page_code | (page_control << 6);
- put_unaligned_be16(cgc->buflen, &cgc->cmd[7]);
- cgc->data_direction = CGC_DATA_READ;
- return pkt_generic_packet(pd, cgc);
-}
-
-static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc)
-{
- memset(cgc->cmd, 0, sizeof(cgc->cmd));
- memset(cgc->buffer, 0, 2);
- cgc->cmd[0] = GPCMD_MODE_SELECT_10;
- cgc->cmd[1] = 0x10; /* PF */
- put_unaligned_be16(cgc->buflen, &cgc->cmd[7]);
- cgc->data_direction = CGC_DATA_WRITE;
- return pkt_generic_packet(pd, cgc);
-}
-
-static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
-{
- struct packet_command cgc;
- int ret;
-
- /* set up command and get the disc info */
- init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
- cgc.cmd[0] = GPCMD_READ_DISC_INFO;
- cgc.cmd[8] = cgc.buflen = 2;
- cgc.quiet = 1;
-
- ret = pkt_generic_packet(pd, &cgc);
- if (ret)
- return ret;
-
- /* not all drives have the same disc_info length, so requeue
- * packet with the length the drive tells us it can supply
- */
- cgc.buflen = be16_to_cpu(di->disc_information_length) +
- sizeof(di->disc_information_length);
-
- if (cgc.buflen > sizeof(disc_information))
- cgc.buflen = sizeof(disc_information);
-
- cgc.cmd[8] = cgc.buflen;
- return pkt_generic_packet(pd, &cgc);
-}
-
-static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti)
-{
- struct packet_command cgc;
- int ret;
-
- init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
- cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
- cgc.cmd[1] = type & 3;
- put_unaligned_be16(track, &cgc.cmd[4]);
- cgc.cmd[8] = 8;
- cgc.quiet = 1;
-
- ret = pkt_generic_packet(pd, &cgc);
- if (ret)
- return ret;
-
- cgc.buflen = be16_to_cpu(ti->track_information_length) +
- sizeof(ti->track_information_length);
-
- if (cgc.buflen > sizeof(track_information))
- cgc.buflen = sizeof(track_information);
-
- cgc.cmd[8] = cgc.buflen;
- return pkt_generic_packet(pd, &cgc);
-}
-
-static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
- long *last_written)
-{
- disc_information di;
- track_information ti;
- __u32 last_track;
- int ret;
-
- ret = pkt_get_disc_info(pd, &di);
- if (ret)
- return ret;
-
- last_track = (di.last_track_msb << 8) | di.last_track_lsb;
- ret = pkt_get_track_info(pd, last_track, 1, &ti);
- if (ret)
- return ret;
-
- /* if this track is blank, try the previous. */
- if (ti.blank) {
- last_track--;
- ret = pkt_get_track_info(pd, last_track, 1, &ti);
- if (ret)
- return ret;
- }
-
- /* if last recorded field is valid, return it. */
- if (ti.lra_v) {
- *last_written = be32_to_cpu(ti.last_rec_address);
- } else {
- /* make it up instead */
- *last_written = be32_to_cpu(ti.track_start) +
- be32_to_cpu(ti.track_size);
- if (ti.free_blocks)
- *last_written -= (be32_to_cpu(ti.free_blocks) + 7);
- }
- return 0;
-}
-
-/*
- * write mode select package based on pd->settings
- */
-static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- write_param_page *wp;
- char buffer[128];
- int ret, size;
-
- /* doesn't apply to DVD+RW or DVD-RAM */
- if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12))
- return 0;
-
- memset(buffer, 0, sizeof(buffer));
- init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
-
- size = 2 + get_unaligned_be16(&buffer[0]);
- pd->mode_offset = get_unaligned_be16(&buffer[6]);
- if (size > sizeof(buffer))
- size = sizeof(buffer);
-
- /*
- * now get it all
- */
- init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
-
- /*
- * write page is offset header + block descriptor length
- */
- wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset];
-
- wp->fp = pd->settings.fp;
- wp->track_mode = pd->settings.track_mode;
- wp->write_type = pd->settings.write_type;
- wp->data_block_type = pd->settings.block_mode;
-
- wp->multi_session = 0;
-
-#ifdef PACKET_USE_LS
- wp->link_size = 7;
- wp->ls_v = 1;
-#endif
-
- if (wp->data_block_type == PACKET_BLOCK_MODE1) {
- wp->session_format = 0;
- wp->subhdr2 = 0x20;
- } else if (wp->data_block_type == PACKET_BLOCK_MODE2) {
- wp->session_format = 0x20;
- wp->subhdr2 = 8;
-#if 0
- wp->mcn[0] = 0x80;
- memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1);
-#endif
- } else {
- /*
- * paranoia
- */
- dev_err(ddev, "write mode wrong %d\n", wp->data_block_type);
- return 1;
- }
- wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
-
- cgc.buflen = cgc.cmd[8] = size;
- ret = pkt_mode_select(pd, &cgc);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
-
- pkt_print_settings(pd);
- return 0;
-}
-
-/*
- * 1 -- we can write to this track, 0 -- we can't
- */
-static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti)
-{
- struct device *ddev = disk_to_dev(pd->disk);
-
- switch (pd->mmc3_profile) {
- case 0x1a: /* DVD+RW */
- case 0x12: /* DVD-RAM */
- /* The track is always writable on DVD+RW/DVD-RAM */
- return 1;
- default:
- break;
- }
-
- if (!ti->packet || !ti->fp)
- return 0;
-
- /*
- * "good" settings as per Mt Fuji.
- */
- if (ti->rt == 0 && ti->blank == 0)
- return 1;
-
- if (ti->rt == 0 && ti->blank == 1)
- return 1;
-
- if (ti->rt == 1 && ti->blank == 0)
- return 1;
-
- dev_err(ddev, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet);
- return 0;
-}
-
-/*
- * 1 -- we can write to this disc, 0 -- we can't
- */
-static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
-{
- struct device *ddev = disk_to_dev(pd->disk);
-
- switch (pd->mmc3_profile) {
- case 0x0a: /* CD-RW */
- case 0xffff: /* MMC3 not supported */
- break;
- case 0x1a: /* DVD+RW */
- case 0x13: /* DVD-RW */
- case 0x12: /* DVD-RAM */
- return 1;
- default:
- dev_dbg(ddev, "Wrong disc profile (%x)\n", pd->mmc3_profile);
- return 0;
- }
-
-	/*
-	 * for disc type 0xff we should probably reserve a new track,
-	 * but I'm not sure; should we leave this to user apps? Probably.
-	 */
- if (di->disc_type == 0xff) {
- dev_notice(ddev, "unknown disc - no track?\n");
- return 0;
- }
-
- if (di->disc_type != 0x20 && di->disc_type != 0) {
- dev_err(ddev, "wrong disc type (%x)\n", di->disc_type);
- return 0;
- }
-
- if (di->erasable == 0) {
- dev_err(ddev, "disc not erasable\n");
- return 0;
- }
-
- if (di->border_status == PACKET_SESSION_RESERVED) {
- dev_err(ddev, "can't write to last track (reserved)\n");
- return 0;
- }
-
- return 1;
-}
-
-static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- struct packet_command cgc;
- unsigned char buf[12];
- disc_information di;
- track_information ti;
- int ret, track;
-
- init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
- cgc.cmd[0] = GPCMD_GET_CONFIGURATION;
- cgc.cmd[8] = 8;
- ret = pkt_generic_packet(pd, &cgc);
- pd->mmc3_profile = ret ? 0xffff : get_unaligned_be16(&buf[6]);
-
- memset(&di, 0, sizeof(disc_information));
- memset(&ti, 0, sizeof(track_information));
-
- ret = pkt_get_disc_info(pd, &di);
- if (ret) {
- dev_err(ddev, "failed get_disc\n");
- return ret;
- }
-
- if (!pkt_writable_disc(pd, &di))
- return -EROFS;
-
- pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
-
- track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
- ret = pkt_get_track_info(pd, track, 1, &ti);
- if (ret) {
- dev_err(ddev, "failed get_track\n");
- return ret;
- }
-
- if (!pkt_writable_track(pd, &ti)) {
- dev_err(ddev, "can't write to this track\n");
- return -EROFS;
- }
-
-	/*
-	 * we keep the packet size in 512-byte units; that makes it easier
-	 * to deal with request calculations.
-	 */
- pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2;
- if (pd->settings.size == 0) {
- dev_notice(ddev, "detected zero packet size!\n");
- return -ENXIO;
- }
- if (pd->settings.size > PACKET_MAX_SECTORS) {
- dev_err(ddev, "packet size is too big\n");
- return -EROFS;
- }
- pd->settings.fp = ti.fp;
- pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1);
-
- if (ti.nwa_v) {
- pd->nwa = be32_to_cpu(ti.next_writable);
- set_bit(PACKET_NWA_VALID, &pd->flags);
- }
-
- /*
- * in theory we could use lra on -RW media as well and just zero
- * blocks that haven't been written yet, but in practice that
- * is just a no-go. we'll use that for -R, naturally.
- */
- if (ti.lra_v) {
- pd->lra = be32_to_cpu(ti.last_rec_address);
- set_bit(PACKET_LRA_VALID, &pd->flags);
- } else {
- pd->lra = 0xffffffff;
- set_bit(PACKET_LRA_VALID, &pd->flags);
- }
-
- /*
- * fine for now
- */
- pd->settings.link_loss = 7;
- pd->settings.write_type = 0; /* packet */
- pd->settings.track_mode = ti.track_mode;
-
- /*
- * mode1 or mode2 disc
- */
- switch (ti.data_mode) {
- case PACKET_MODE1:
- pd->settings.block_mode = PACKET_BLOCK_MODE1;
- break;
- case PACKET_MODE2:
- pd->settings.block_mode = PACKET_BLOCK_MODE2;
- break;
- default:
- dev_err(ddev, "unknown data mode\n");
- return -EROFS;
- }
- return 0;
-}
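/*
 * Unit check for the settings above: the drive reports ti.fixed_packet_size
 * in 2048-byte logical blocks, so the "<< 2" stores pd->settings.size in
 * 512-byte sectors - e.g. a 16-block fixed packet becomes 64 sectors (32KiB),
 * which is what the PACKET_MAX_SECTORS test and the zone arithmetic expect.
 */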
-
-/*
- * enable/disable write caching on drive
- */
-static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- unsigned char buf[64];
- bool set = IS_ENABLED(CONFIG_CDROM_PKTCDVD_WCACHE);
- int ret;
-
- init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- cgc.buflen = pd->mode_offset + 12;
-
- /*
- * caching mode page might not be there, so quiet this command
- */
- cgc.quiet = 1;
-
- ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0);
- if (ret)
- return ret;
-
- /*
- * use drive write caching -- we need deferred error handling to be
- * able to successfully recover with this option (drive will return good
- * status as soon as the cdb is validated).
- */
- buf[pd->mode_offset + 10] |= (set << 2);
-
- cgc.buflen = cgc.cmd[8] = 2 + get_unaligned_be16(&buf[0]);
- ret = pkt_mode_select(pd, &cgc);
- if (ret) {
- dev_err(ddev, "write caching control failed\n");
- pkt_dump_sense(pd, &cgc);
- } else if (!ret && set)
- dev_notice(ddev, "enabled write caching\n");
- return ret;
-}
-
-static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag)
-{
- struct packet_command cgc;
-
- init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
- cgc.cmd[4] = lockflag ? 1 : 0;
- return pkt_generic_packet(pd, &cgc);
-}
-
-/*
- * Returns drive maximum write speed
- */
-static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
- unsigned *write_speed)
-{
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- unsigned char buf[256+18];
- unsigned char *cap_buf;
- int ret, offset;
-
- cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
- init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
- cgc.sshdr = &sshdr;
-
- ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
- if (ret) {
- cgc.buflen = pd->mode_offset + cap_buf[1] + 2 +
- sizeof(struct mode_page_header);
- ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
- }
-
- offset = 20; /* Obsoleted field, used by older drives */
- if (cap_buf[1] >= 28)
- offset = 28; /* Current write speed selected */
- if (cap_buf[1] >= 30) {
-		/* If the drive reports at least one "Logical Unit Write
-		 * Speed Performance Descriptor Block", use the information
-		 * in the first block, as it contains the highest speed.
-		 */
- int num_spdb = get_unaligned_be16(&cap_buf[30]);
- if (num_spdb > 0)
- offset = 34;
- }
-
- *write_speed = get_unaligned_be16(&cap_buf[offset]);
- return 0;
-}
-
-/* These tables are from cdrecord - I don't have the Orange Book */
-/* standard speed CD-RW (1-4x) */
-static char clv_to_speed[16] = {
- /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
- 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-/* high speed CD-RW (up to 10x) */
-static char hs_clv_to_speed[16] = {
- /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
- 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-/* ultra high speed CD-RW */
-static char us_clv_to_speed[16] = {
- /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
- 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0
-};
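/*
 * Example lookup: an Ultra Speed disc (sub-type 2 below) whose ATIP A1 speed
 * nibble reads 8 maps to us_clv_to_speed[8] == 24, i.e. a 24x maximum media
 * write speed, which pkt_open_write() later converts to kB/s using the usual
 * ~176kB/s per 1x CD rate.
 */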
-
-/*
- * reads the maximum media speed from ATIP
- */
-static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
- unsigned *speed)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- unsigned char buf[64];
- unsigned int size, st, sp;
- int ret;
-
- init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
- cgc.cmd[1] = 2;
- cgc.cmd[2] = 4; /* READ ATIP */
- cgc.cmd[8] = 2;
- ret = pkt_generic_packet(pd, &cgc);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
- size = 2 + get_unaligned_be16(&buf[0]);
- if (size > sizeof(buf))
- size = sizeof(buf);
-
- init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
- cgc.cmd[1] = 2;
- cgc.cmd[2] = 4;
- cgc.cmd[8] = size;
- ret = pkt_generic_packet(pd, &cgc);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
-
- if (!(buf[6] & 0x40)) {
- dev_notice(ddev, "disc type is not CD-RW\n");
- return 1;
- }
- if (!(buf[6] & 0x4)) {
- dev_notice(ddev, "A1 values on media are not valid, maybe not CDRW?\n");
- return 1;
- }
-
- st = (buf[6] >> 3) & 0x7; /* disc sub-type */
-
- sp = buf[16] & 0xf; /* max speed from ATIP A1 field */
-
- /* Info from cdrecord */
- switch (st) {
- case 0: /* standard speed */
- *speed = clv_to_speed[sp];
- break;
- case 1: /* high speed */
- *speed = hs_clv_to_speed[sp];
- break;
- case 2: /* ultra high speed */
- *speed = us_clv_to_speed[sp];
- break;
- default:
- dev_notice(ddev, "unknown disc sub-type %d\n", st);
- return 1;
- }
- if (*speed) {
- dev_info(ddev, "maximum media speed: %d\n", *speed);
- return 0;
- } else {
- dev_notice(ddev, "unknown speed %d for sub-type %d\n", sp, st);
- return 1;
- }
-}
-
-static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- int ret;
-
- dev_dbg(ddev, "Performing OPC\n");
-
- init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.sshdr = &sshdr;
- cgc.timeout = 60*HZ;
- cgc.cmd[0] = GPCMD_SEND_OPC;
- cgc.cmd[1] = 1;
- ret = pkt_generic_packet(pd, &cgc);
- if (ret)
- pkt_dump_sense(pd, &cgc);
- return ret;
-}
-
-static int pkt_open_write(struct pktcdvd_device *pd)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- int ret;
- unsigned int write_speed, media_write_speed, read_speed;
-
- ret = pkt_probe_settings(pd);
- if (ret) {
- dev_dbg(ddev, "failed probe\n");
- return ret;
- }
-
- ret = pkt_set_write_settings(pd);
- if (ret) {
- dev_notice(ddev, "failed saving write settings\n");
- return -EIO;
- }
-
- pkt_write_caching(pd);
-
- ret = pkt_get_max_speed(pd, &write_speed);
- if (ret)
- write_speed = 16 * 177;
- switch (pd->mmc3_profile) {
- case 0x13: /* DVD-RW */
- case 0x1a: /* DVD+RW */
- case 0x12: /* DVD-RAM */
- dev_notice(ddev, "write speed %ukB/s\n", write_speed);
- break;
- default:
- ret = pkt_media_speed(pd, &media_write_speed);
- if (ret)
- media_write_speed = 16;
- write_speed = min(write_speed, media_write_speed * 177);
- dev_notice(ddev, "write speed %ux\n", write_speed / 176);
- break;
- }
- read_speed = write_speed;
-
- ret = pkt_set_speed(pd, write_speed, read_speed);
- if (ret) {
- dev_notice(ddev, "couldn't set write speed\n");
- return -EIO;
- }
- pd->write_speed = write_speed;
- pd->read_speed = read_speed;
-
- ret = pkt_perform_opc(pd);
- if (ret)
- dev_notice(ddev, "Optimum Power Calibration failed\n");
-
- return 0;
-}
-
-/*
- * called at open time.
- */
-static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- int ret;
- long lba;
- struct request_queue *q;
- struct file *bdev_file;
-
- /*
- * We need to re-open the cdrom device without O_NONBLOCK to be able
- * to read/write from/to it. It is already opened in O_NONBLOCK mode
- * so open should not fail.
- */
- bdev_file = bdev_file_open_by_dev(file_bdev(pd->bdev_file)->bd_dev,
- BLK_OPEN_READ, pd, NULL);
- if (IS_ERR(bdev_file)) {
- ret = PTR_ERR(bdev_file);
- goto out;
- }
- pd->f_open_bdev = bdev_file;
-
- ret = pkt_get_last_written(pd, &lba);
- if (ret) {
- dev_err(ddev, "pkt_get_last_written failed\n");
- goto out_putdev;
- }
-
- set_capacity(pd->disk, lba << 2);
- set_capacity_and_notify(file_bdev(pd->bdev_file)->bd_disk, lba << 2);
-
- q = bdev_get_queue(file_bdev(pd->bdev_file));
- if (write) {
- ret = pkt_open_write(pd);
- if (ret)
- goto out_putdev;
- set_bit(PACKET_WRITABLE, &pd->flags);
- } else {
- pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
- clear_bit(PACKET_WRITABLE, &pd->flags);
- }
-
- ret = pkt_set_segment_merging(pd, q);
- if (ret)
- goto out_putdev;
-
- if (write) {
- if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
- dev_err(ddev, "not enough memory for buffers\n");
- ret = -ENOMEM;
- goto out_putdev;
- }
- dev_info(ddev, "%lukB available on disc\n", lba << 1);
- }
- set_blocksize(bdev_file, CD_FRAMESIZE);
-
- return 0;
-
-out_putdev:
- fput(bdev_file);
-out:
- return ret;
-}
-
-/*
- * called when the device is closed. makes sure that the device flushes
- * the internal cache before we close.
- */
-static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
-{
- struct device *ddev = disk_to_dev(pd->disk);
-
- if (flush && pkt_flush_cache(pd))
- dev_notice(ddev, "not flushing cache\n");
-
- pkt_lock_door(pd, 0);
-
- pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
- fput(pd->f_open_bdev);
- pd->f_open_bdev = NULL;
-
- pkt_shrink_pktlist(pd);
-}
-
-static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
-{
- if (dev_minor >= MAX_WRITERS)
- return NULL;
-
- dev_minor = array_index_nospec(dev_minor, MAX_WRITERS);
- return pkt_devs[dev_minor];
-}
-
-static int pkt_open(struct gendisk *disk, blk_mode_t mode)
-{
- struct pktcdvd_device *pd = NULL;
- int ret;
-
- mutex_lock(&pktcdvd_mutex);
- mutex_lock(&ctl_mutex);
- pd = pkt_find_dev_from_minor(disk->first_minor);
- if (!pd) {
- ret = -ENODEV;
- goto out;
- }
- BUG_ON(pd->refcnt < 0);
-
- pd->refcnt++;
- if (pd->refcnt > 1) {
- if ((mode & BLK_OPEN_WRITE) &&
- !test_bit(PACKET_WRITABLE, &pd->flags)) {
- ret = -EBUSY;
- goto out_dec;
- }
- } else {
- ret = pkt_open_dev(pd, mode & BLK_OPEN_WRITE);
- if (ret)
- goto out_dec;
- }
- mutex_unlock(&ctl_mutex);
- mutex_unlock(&pktcdvd_mutex);
- return 0;
-
-out_dec:
- pd->refcnt--;
-out:
- mutex_unlock(&ctl_mutex);
- mutex_unlock(&pktcdvd_mutex);
- return ret;
-}
-
-static void pkt_release(struct gendisk *disk)
-{
- struct pktcdvd_device *pd = disk->private_data;
-
- mutex_lock(&pktcdvd_mutex);
- mutex_lock(&ctl_mutex);
- pd->refcnt--;
- BUG_ON(pd->refcnt < 0);
- if (pd->refcnt == 0) {
- int flush = test_bit(PACKET_WRITABLE, &pd->flags);
- pkt_release_dev(pd, flush);
- }
- mutex_unlock(&ctl_mutex);
- mutex_unlock(&pktcdvd_mutex);
-}
-
-
-static void pkt_end_io_read_cloned(struct bio *bio)
-{
- struct packet_stacked_data *psd = bio->bi_private;
- struct pktcdvd_device *pd = psd->pd;
-
- psd->bio->bi_status = bio->bi_status;
- bio_put(bio);
- bio_endio(psd->bio);
- mempool_free(psd, &psd_pool);
- pkt_bio_finished(pd);
-}
-
-static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
-{
- struct bio *cloned_bio = bio_alloc_clone(file_bdev(pd->bdev_file), bio,
- GFP_NOIO, &pkt_bio_set);
- struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
-
- psd->pd = pd;
- psd->bio = bio;
- cloned_bio->bi_private = psd;
- cloned_bio->bi_end_io = pkt_end_io_read_cloned;
- pd->stats.secs_r += bio_sectors(bio);
- pkt_queue_bio(pd, cloned_bio);
-}
-
-static void pkt_make_request_write(struct bio *bio)
-{
- struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
- sector_t zone;
- struct packet_data *pkt;
- int was_empty, blocked_bio;
- struct pkt_rb_node *node;
-
- zone = get_zone(bio->bi_iter.bi_sector, pd);
-
- /*
- * If we find a matching packet in state WAITING or READ_WAIT, we can
- * just append this bio to that packet.
- */
- spin_lock(&pd->cdrw.active_list_lock);
- blocked_bio = 0;
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (pkt->sector == zone) {
- spin_lock(&pkt->lock);
- if ((pkt->state == PACKET_WAITING_STATE) ||
- (pkt->state == PACKET_READ_WAIT_STATE)) {
- bio_list_add(&pkt->orig_bios, bio);
- pkt->write_size +=
- bio->bi_iter.bi_size / CD_FRAMESIZE;
- if ((pkt->write_size >= pkt->frames) &&
- (pkt->state == PACKET_WAITING_STATE)) {
- atomic_inc(&pkt->run_sm);
- wake_up(&pd->wqueue);
- }
- spin_unlock(&pkt->lock);
- spin_unlock(&pd->cdrw.active_list_lock);
- return;
- } else {
- blocked_bio = 1;
- }
- spin_unlock(&pkt->lock);
- }
- }
- spin_unlock(&pd->cdrw.active_list_lock);
-
- /*
- * Test if there is enough room left in the bio work queue
- * (queue size >= congestion on mark).
- * If not, wait till the work queue size is below the congestion off mark.
- */
- spin_lock(&pd->lock);
- if (pd->write_congestion_on > 0
- && pd->bio_queue_size >= pd->write_congestion_on) {
- struct wait_bit_queue_entry wqe;
-
- init_wait_var_entry(&wqe, &pd->congested, 0);
- for (;;) {
- prepare_to_wait_event(__var_waitqueue(&pd->congested),
- &wqe.wq_entry,
- TASK_UNINTERRUPTIBLE);
- if (pd->bio_queue_size <= pd->write_congestion_off)
- break;
- pd->congested = true;
- spin_unlock(&pd->lock);
- schedule();
- spin_lock(&pd->lock);
- }
- }
- spin_unlock(&pd->lock);
-
- /*
- * No matching packet found. Store the bio in the work queue.
- */
- node = mempool_alloc(&pd->rb_pool, GFP_NOIO);
- node->bio = bio;
- spin_lock(&pd->lock);
- BUG_ON(pd->bio_queue_size < 0);
- was_empty = (pd->bio_queue_size == 0);
- pkt_rbtree_insert(pd, node);
- spin_unlock(&pd->lock);
-
- /*
- * Wake up the worker thread.
- */
- atomic_set(&pd->scan_queue, 1);
- if (was_empty) {
- /* This wake_up is required for correct operation */
- wake_up(&pd->wqueue);
- } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) {
- /*
- * This wake up is not required for correct operation,
- * but improves performance in some cases.
- */
- wake_up(&pd->wqueue);
- }
-}
-
-static void pkt_submit_bio(struct bio *bio)
-{
- struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
- struct device *ddev = disk_to_dev(pd->disk);
- struct bio *split;
-
- bio = bio_split_to_limits(bio);
- if (!bio)
- return;
-
- dev_dbg(ddev, "start = %6llx stop = %6llx\n",
- bio->bi_iter.bi_sector, bio_end_sector(bio));
-
- /*
- * Clone READ bios so we can have our own bi_end_io callback.
- */
- if (bio_data_dir(bio) == READ) {
- pkt_make_request_read(pd, bio);
- return;
- }
-
- if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
- dev_notice(ddev, "WRITE for ro device (%llu)\n", bio->bi_iter.bi_sector);
- goto end_io;
- }
-
- if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) {
- dev_err(ddev, "wrong bio size\n");
- goto end_io;
- }
-
- do {
- sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
- sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);
-
- if (last_zone != zone) {
- BUG_ON(last_zone != zone + pd->settings.size);
-
- split = bio_split(bio, last_zone -
- bio->bi_iter.bi_sector,
- GFP_NOIO, &pkt_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
-
- pkt_make_request_write(split);
- } while (split != bio);
-
- return;
-end_io:
- bio_io_error(bio);
-}
-
-static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
-{
- struct device *ddev = disk_to_dev(pd->disk);
- int i;
- struct file *bdev_file;
- struct scsi_device *sdev;
-
- if (pd->pkt_dev == dev) {
- dev_err(ddev, "recursive setup not allowed\n");
- return -EBUSY;
- }
- for (i = 0; i < MAX_WRITERS; i++) {
- struct pktcdvd_device *pd2 = pkt_devs[i];
- if (!pd2)
- continue;
- if (file_bdev(pd2->bdev_file)->bd_dev == dev) {
- dev_err(ddev, "%pg already setup\n",
- file_bdev(pd2->bdev_file));
- return -EBUSY;
- }
- if (pd2->pkt_dev == dev) {
- dev_err(ddev, "can't chain pktcdvd devices\n");
- return -EBUSY;
- }
- }
-
- bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY,
- NULL, NULL);
- if (IS_ERR(bdev_file))
- return PTR_ERR(bdev_file);
- sdev = scsi_device_from_queue(file_bdev(bdev_file)->bd_disk->queue);
- if (!sdev) {
- fput(bdev_file);
- return -EINVAL;
- }
- put_device(&sdev->sdev_gendev);
-
- /* This is safe, since we have a reference from open(). */
- __module_get(THIS_MODULE);
-
- pd->bdev_file = bdev_file;
-
- atomic_set(&pd->cdrw.pending_bios, 0);
- pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name);
- if (IS_ERR(pd->cdrw.thread)) {
- dev_err(ddev, "can't start kernel thread\n");
- goto out_mem;
- }
-
- proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd);
- dev_notice(ddev, "writer mapped to %pg\n", file_bdev(bdev_file));
- return 0;
-
-out_mem:
- fput(bdev_file);
- /* This is safe: open() is still holding a reference. */
- module_put(THIS_MODULE);
- return -ENOMEM;
-}
-
-static int pkt_ioctl(struct block_device *bdev, blk_mode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- struct pktcdvd_device *pd = bdev->bd_disk->private_data;
- struct device *ddev = disk_to_dev(pd->disk);
- int ret;
-
- dev_dbg(ddev, "cmd %x, dev %d:%d\n", cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
-
- mutex_lock(&pktcdvd_mutex);
- switch (cmd) {
- case CDROMEJECT:
- /*
- * The door gets locked when the device is opened, so we
- * have to unlock it or else the eject command fails.
- */
- if (pd->refcnt == 1)
- pkt_lock_door(pd, 0);
- fallthrough;
- /*
- * forward selected CDROM ioctls to CD-ROM, for UDF
- */
- case CDROMMULTISESSION:
- case CDROMREADTOCENTRY:
- case CDROM_LAST_WRITTEN:
- case CDROM_SEND_PACKET:
- case SCSI_IOCTL_SEND_COMMAND:
- if (!bdev->bd_disk->fops->ioctl)
- ret = -ENOTTY;
- else
- ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
- break;
- default:
- dev_dbg(ddev, "Unknown ioctl (%x)\n", cmd);
- ret = -ENOTTY;
- }
- mutex_unlock(&pktcdvd_mutex);
-
- return ret;
-}
-
-static unsigned int pkt_check_events(struct gendisk *disk,
- unsigned int clearing)
-{
- struct pktcdvd_device *pd = disk->private_data;
- struct gendisk *attached_disk;
-
- if (!pd)
- return 0;
- if (!pd->bdev_file)
- return 0;
- attached_disk = file_bdev(pd->bdev_file)->bd_disk;
- if (!attached_disk || !attached_disk->fops->check_events)
- return 0;
- return attached_disk->fops->check_events(attached_disk, clearing);
-}
-
-static char *pkt_devnode(struct gendisk *disk, umode_t *mode)
-{
- return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name);
-}
-
-static const struct block_device_operations pktcdvd_ops = {
- .owner = THIS_MODULE,
- .submit_bio = pkt_submit_bio,
- .open = pkt_open,
- .release = pkt_release,
- .ioctl = pkt_ioctl,
- .compat_ioctl = blkdev_compat_ptr_ioctl,
- .check_events = pkt_check_events,
- .devnode = pkt_devnode,
-};
-
-/*
- * Set up mapping from pktcdvd device to CD-ROM device.
- */
-static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
-{
- struct queue_limits lim = {
- .max_hw_sectors = PACKET_MAX_SECTORS,
- .logical_block_size = CD_FRAMESIZE,
- .features = BLK_FEAT_ROTATIONAL,
- };
- int idx;
- int ret = -ENOMEM;
- struct pktcdvd_device *pd;
- struct gendisk *disk;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- for (idx = 0; idx < MAX_WRITERS; idx++)
- if (!pkt_devs[idx])
- break;
- if (idx == MAX_WRITERS) {
- pr_err("max %d writers supported\n", MAX_WRITERS);
- ret = -EBUSY;
- goto out_mutex;
- }
-
- pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL);
- if (!pd)
- goto out_mutex;
-
- ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE,
- sizeof(struct pkt_rb_node));
- if (ret)
- goto out_mem;
-
- INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
- INIT_LIST_HEAD(&pd->cdrw.pkt_active_list);
- spin_lock_init(&pd->cdrw.active_list_lock);
-
- spin_lock_init(&pd->lock);
- spin_lock_init(&pd->iosched.lock);
- bio_list_init(&pd->iosched.read_queue);
- bio_list_init(&pd->iosched.write_queue);
- init_waitqueue_head(&pd->wqueue);
- pd->bio_queue = RB_ROOT;
-
- pd->write_congestion_on = write_congestion_on;
- pd->write_congestion_off = write_congestion_off;
-
- disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
- if (IS_ERR(disk)) {
- ret = PTR_ERR(disk);
- goto out_mem;
- }
- pd->disk = disk;
- disk->major = pktdev_major;
- disk->first_minor = idx;
- disk->minors = 1;
- disk->fops = &pktcdvd_ops;
- disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART;
- snprintf(disk->disk_name, sizeof(disk->disk_name), DRIVER_NAME"%d", idx);
- disk->private_data = pd;
-
- pd->pkt_dev = MKDEV(pktdev_major, idx);
- ret = pkt_new_dev(pd, dev);
- if (ret)
- goto out_mem2;
-
- /* inherit events of the host device */
- disk->events = file_bdev(pd->bdev_file)->bd_disk->events;
-
- ret = add_disk(disk);
- if (ret)
- goto out_mem2;
-
- pkt_sysfs_dev_new(pd);
- pkt_debugfs_dev_new(pd);
-
- pkt_devs[idx] = pd;
- if (pkt_dev)
- *pkt_dev = pd->pkt_dev;
-
- mutex_unlock(&ctl_mutex);
- return 0;
-
-out_mem2:
- put_disk(disk);
-out_mem:
- mempool_exit(&pd->rb_pool);
- kfree(pd);
-out_mutex:
- mutex_unlock(&ctl_mutex);
- pr_err("setup of pktcdvd device failed\n");
- return ret;
-}
-
-/*
- * Tear down mapping from pktcdvd device to CD-ROM device.
- */
-static int pkt_remove_dev(dev_t pkt_dev)
-{
- struct pktcdvd_device *pd;
- struct device *ddev;
- int idx;
- int ret = 0;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- for (idx = 0; idx < MAX_WRITERS; idx++) {
- pd = pkt_devs[idx];
- if (pd && (pd->pkt_dev == pkt_dev))
- break;
- }
- if (idx == MAX_WRITERS) {
- pr_debug("dev not setup\n");
- ret = -ENXIO;
- goto out;
- }
-
- if (pd->refcnt > 0) {
- ret = -EBUSY;
- goto out;
- }
-
- ddev = disk_to_dev(pd->disk);
-
- if (!IS_ERR(pd->cdrw.thread))
- kthread_stop(pd->cdrw.thread);
-
- pkt_devs[idx] = NULL;
-
- pkt_debugfs_dev_remove(pd);
- pkt_sysfs_dev_remove(pd);
-
- fput(pd->bdev_file);
-
- remove_proc_entry(pd->disk->disk_name, pkt_proc);
- dev_notice(ddev, "writer unmapped\n");
-
- del_gendisk(pd->disk);
- put_disk(pd->disk);
-
- mempool_exit(&pd->rb_pool);
- kfree(pd);
-
- /* This is safe: open() is still holding a reference. */
- module_put(THIS_MODULE);
-
-out:
- mutex_unlock(&ctl_mutex);
- return ret;
-}
-
-static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
-{
- struct pktcdvd_device *pd;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index);
- if (pd) {
- ctrl_cmd->dev = new_encode_dev(file_bdev(pd->bdev_file)->bd_dev);
- ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev);
- } else {
- ctrl_cmd->dev = 0;
- ctrl_cmd->pkt_dev = 0;
- }
- ctrl_cmd->num_devices = MAX_WRITERS;
-
- mutex_unlock(&ctl_mutex);
-}
-
-static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- struct pkt_ctrl_command ctrl_cmd;
- int ret = 0;
- dev_t pkt_dev = 0;
-
- if (cmd != PACKET_CTRL_CMD)
- return -ENOTTY;
-
- if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command)))
- return -EFAULT;
-
- switch (ctrl_cmd.command) {
- case PKT_CTRL_CMD_SETUP:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev);
- ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev);
- break;
- case PKT_CTRL_CMD_TEARDOWN:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev));
- break;
- case PKT_CTRL_CMD_STATUS:
- pkt_get_status(&ctrl_cmd);
- break;
- default:
- return -ENOTTY;
- }
-
- if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command)))
- return -EFAULT;
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
-
-static const struct file_operations pkt_ctl_fops = {
- .open = nonseekable_open,
- .unlocked_ioctl = pkt_ctl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = pkt_ctl_compat_ioctl,
-#endif
- .owner = THIS_MODULE,
-};
-
-static struct miscdevice pkt_misc = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = DRIVER_NAME,
- .nodename = "pktcdvd/control",
- .fops = &pkt_ctl_fops
-};
-
-static int __init pkt_init(void)
-{
- int ret;
-
- mutex_init(&ctl_mutex);
-
- ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE,
- sizeof(struct packet_stacked_data));
- if (ret)
- return ret;
- ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0);
- if (ret) {
- mempool_exit(&psd_pool);
- return ret;
- }
-
- ret = register_blkdev(pktdev_major, DRIVER_NAME);
- if (ret < 0) {
- pr_err("unable to register block device\n");
- goto out2;
- }
- if (!pktdev_major)
- pktdev_major = ret;
-
- ret = pkt_sysfs_init();
- if (ret)
- goto out;
-
- pkt_debugfs_init();
-
- ret = misc_register(&pkt_misc);
- if (ret) {
- pr_err("unable to register misc device\n");
- goto out_misc;
- }
-
- pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL);
-
- return 0;
-
-out_misc:
- pkt_debugfs_cleanup();
- pkt_sysfs_cleanup();
-out:
- unregister_blkdev(pktdev_major, DRIVER_NAME);
-out2:
- mempool_exit(&psd_pool);
- bioset_exit(&pkt_bio_set);
- return ret;
-}
-
-static void __exit pkt_exit(void)
-{
- remove_proc_entry("driver/"DRIVER_NAME, NULL);
- misc_deregister(&pkt_misc);
-
- pkt_debugfs_cleanup();
- pkt_sysfs_cleanup();
-
- unregister_blkdev(pktdev_major, DRIVER_NAME);
- mempool_exit(&psd_pool);
- bioset_exit(&pkt_bio_set);
-}
-
-MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");
-MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
-MODULE_LICENSE("GPL");
-
-module_init(pkt_init);
-module_exit(pkt_exit);
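
With pktcdvd gone, the /dev/pktcdvd/control misc device and its PACKET_CTRL_CMD ioctl disappear as well. For illustration only (not part of this patch), here is a minimal userspace sketch of the setup call that the deleted pkt_ctl_ioctl() used to serve, written against the old <linux/pktcdvd.h> UAPI header; the source device numbers below are placeholders:

/* Map an MMC-compliant writer (e.g. 11:0, sr0) to a new pktcdvd device. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/sysmacros.h>
#include <linux/pktcdvd.h>

int main(void)
{
	struct pkt_ctrl_command c = {
		.command = PKT_CTRL_CMD_SETUP,
		.dev = makedev(11, 0),		/* underlying CD/DVD writer */
	};
	int fd = open("/dev/pktcdvd/control", O_RDWR);

	if (fd < 0 || ioctl(fd, PACKET_CTRL_CMD, &c) < 0) {
		perror("pktcdvd setup");
		return 1;
	}
	printf("writer mapped, pkt_dev %u:%u\n",
	       major(c.pkt_dev), minor(c.pkt_dev));
	close(fd);
	return 0;
}

PKT_CTRL_CMD_TEARDOWN and PKT_CTRL_CMD_STATUS followed the same pattern, matching the three cases handled by the removed pkt_ctl_ioctl() above.
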
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index 2ee6e9bd4e28..2df8941a6b14 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -147,12 +147,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
- if (bio_add_page(bio, virt_to_page(data), datalen,
- offset_in_page(data)) != datalen) {
- rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
- err = -EINVAL;
- goto bio_put;
- }
+ bio_add_virt_nofail(bio, data, datalen);
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
if (bio_has_data(bio) &&
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index b5727dea15bd..7af21fe67671 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -957,8 +957,10 @@ static bool vdc_port_mpgroup_check(struct vio_dev *vdev)
dev = device_find_child(vdev->dev.parent, &port_data,
vdc_device_probed);
- if (dev)
+ if (dev) {
+ put_device(dev);
return true;
+ }
return false;
}
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index ee6cade70222..01f7aef3fcfb 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -555,7 +555,7 @@ static void act(struct floppy_state *fs)
static void scan_timeout(struct timer_list *t)
{
- struct floppy_state *fs = from_timer(fs, t, timeout);
+ struct floppy_state *fs = timer_container_of(fs, t, timeout);
struct swim3 __iomem *sw = fs->swim3;
unsigned long flags;
@@ -579,7 +579,7 @@ static void scan_timeout(struct timer_list *t)
static void seek_timeout(struct timer_list *t)
{
- struct floppy_state *fs = from_timer(fs, t, timeout);
+ struct floppy_state *fs = timer_container_of(fs, t, timeout);
struct swim3 __iomem *sw = fs->swim3;
unsigned long flags;
@@ -598,7 +598,7 @@ static void seek_timeout(struct timer_list *t)
static void settle_timeout(struct timer_list *t)
{
- struct floppy_state *fs = from_timer(fs, t, timeout);
+ struct floppy_state *fs = timer_container_of(fs, t, timeout);
struct swim3 __iomem *sw = fs->swim3;
unsigned long flags;
@@ -627,7 +627,7 @@ static void settle_timeout(struct timer_list *t)
static void xfer_timeout(struct timer_list *t)
{
- struct floppy_state *fs = from_timer(fs, t, timeout);
+ struct floppy_state *fs = timer_container_of(fs, t, timeout);
struct swim3 __iomem *sw = fs->swim3;
struct dbdma_regs __iomem *dr = fs->dma;
unsigned long flags;
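
The four swim3 hunks are a mechanical rename of from_timer() to timer_container_of(); both helpers recover the structure that embeds the struct timer_list handed to the callback. A minimal sketch of the pattern, using a hypothetical my_state structure rather than this driver's floppy_state:

#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_state {
	struct timer_list timeout;
	bool busy;
};

static void my_timeout(struct timer_list *t)
{
	/* recover the enclosing my_state from its embedded timer */
	struct my_state *s = timer_container_of(s, t, timeout);

	s->busy = false;
}

static void my_start(struct my_state *s)
{
	s->busy = true;
	timer_setup(&s->timeout, my_timeout, 0);
	mod_timer(&s->timeout, jiffies + HZ);
}
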
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index dc104c025cd5..99abd67b708b 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -48,8 +48,12 @@
#define UBLK_MINORS (1U << MINORBITS)
+#define UBLK_INVALID_BUF_IDX ((u16)-1)
+
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
+#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -64,7 +68,12 @@
| UBLK_F_CMD_IOCTL_ENCODE \
| UBLK_F_USER_COPY \
| UBLK_F_ZONED \
- | UBLK_F_USER_RECOVERY_FAIL_IO)
+ | UBLK_F_USER_RECOVERY_FAIL_IO \
+ | UBLK_F_UPDATE_SIZE \
+ | UBLK_F_AUTO_BUF_REG \
+ | UBLK_F_QUIESCE \
+ | UBLK_F_PER_IO_DAEMON \
+ | UBLK_F_BUF_REG_OFF_DAEMON)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@@ -76,10 +85,6 @@
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
-struct ublk_rq_data {
- struct kref ref;
-};
-
struct ublk_uring_cmd_pdu {
/*
* Store requests in same batch temporarily for queuing them to
@@ -99,6 +104,7 @@ struct ublk_uring_cmd_pdu {
* setup in ublk uring_cmd handler
*/
struct ublk_queue *ubq;
+
u16 tag;
};
@@ -131,28 +137,68 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+/*
+ * request buffer is registered automatically, so we have to unregister it
+ * before completing this request.
+ *
+ * io_uring will unregister buffer automatically for us during exiting.
+ */
+#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
+
/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000
+/*
+ * Initialize refcount to a large number to include any registered buffers.
+ * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
+ * any buffers registered on the io daemon task.
+ */
+#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
+
struct ublk_io {
/* userspace buffer address from io cmd */
- __u64 addr;
+ union {
+ __u64 addr;
+ struct ublk_auto_buf_reg buf;
+ };
unsigned int flags;
int res;
- struct io_uring_cmd *cmd;
-};
+ union {
+ /* valid if UBLK_IO_FLAG_ACTIVE is set */
+ struct io_uring_cmd *cmd;
+ /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
+ struct request *req;
+ };
+
+ struct task_struct *task;
+
+ /*
+ * The number of uses of this I/O by the ublk server
+ * if user copy or zero copy are enabled:
+ * - UBLK_REFCOUNT_INIT from dispatch to the server
+ * until UBLK_IO_COMMIT_AND_FETCH_REQ
+ * - 1 for each inflight ublk_ch_{read,write}_iter() call
+ * - 1 for each io_uring registered buffer not registered on task
+ * The I/O can only be completed once all references are dropped.
+ * User copy and buffer registration operations are only permitted
+ * if the reference count is nonzero.
+ */
+ refcount_t ref;
+ /* Count of buffers registered on task and not yet unregistered */
+ unsigned task_registered_buffers;
+
+ void *buf_ctx_handle;
+} ____cacheline_aligned_in_smp;
struct ublk_queue {
int q_id;
int q_depth;
unsigned long flags;
- struct task_struct *ubq_daemon;
struct ublksrv_io_desc *io_cmd_buf;
bool force_abort;
- bool timeout;
bool canceling;
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
unsigned short nr_io_ready; /* how many ios setup */
@@ -189,7 +235,10 @@ struct ublk_device {
struct completion completion;
unsigned int nr_queues_ready;
- unsigned int nr_privileged_daemon;
+ bool unprivileged_daemons;
+ struct mutex cancel_mutex;
+ bool canceling;
+ pid_t ublksrv_tgid;
};
/* header of ublk_params */
@@ -198,13 +247,20 @@ struct ublk_params_header {
__u32 types;
};
+static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, int tag, size_t offset);
+ const struct ublk_queue *ubq, struct ublk_io *io,
+ size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
-static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag);
+
+static inline struct ublksrv_io_desc *
+ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
+{
+ return &ubq->io_cmd_buf[tag];
+}
+
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_ZONED;
@@ -356,8 +412,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
if (ret)
goto free_req;
- ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
- GFP_KERNEL);
+ ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
if (ret)
goto erase_desc;
@@ -477,7 +532,6 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
static inline void __ublk_complete_rq(struct request *req);
-static void ublk_complete_rq(struct kref *ref);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@@ -609,6 +663,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
+static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_AUTO_BUF_REG;
+}
+
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_USER_COPY;
@@ -616,7 +675,8 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
- return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq);
+ return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
+ !ublk_support_auto_buf_reg(ubq);
}
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@@ -627,42 +687,39 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
*
* for zero copy, request buffer need to be registered to io_uring
* buffer table, so reference is needed
+ *
+	 * For auto buffer registration, the ublk server may still issue
+	 * UBLK_IO_COMMIT_AND_FETCH_REQ before a registered buffer is used up,
+	 * so a reference is required too.
*/
- return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq);
+ return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
+ ublk_support_auto_buf_reg(ubq);
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
- struct request *req)
+ struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- kref_init(&data->ref);
- }
+ if (ublk_need_req_ref(ubq))
+ refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
}
-static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
- struct request *req)
+static inline bool ublk_get_req_ref(struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- return kref_get_unless_zero(&data->ref);
- }
+ return refcount_inc_not_zero(&io->ref);
+}
- return true;
+static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
+{
+ if (refcount_dec_and_test(&io->ref))
+ __ublk_complete_rq(req);
}
-static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
- struct request *req)
+static inline bool ublk_sub_req_ref(struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
- kref_put(&data->ref, ublk_complete_rq);
- } else {
- __ublk_complete_rq(req);
- }
+ io->task_registered_buffers = 0;
+ return refcount_sub_and_test(sub_refs, &io->ref);
}
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
@@ -695,12 +752,6 @@ static inline bool ublk_rq_has_data(const struct request *rq)
return bio_has_data(rq->bio);
}
-static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag)
-{
- return &ubq->io_cmd_buf[tag];
-}
-
static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
@@ -945,7 +996,7 @@ static inline bool ublk_need_unmap_req(const struct request *req)
}
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
- struct ublk_io *io)
+ const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
@@ -969,7 +1020,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
static int ublk_unmap_io(const struct ublk_queue *ubq,
const struct request *req,
- struct ublk_io *io)
+ const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
@@ -1064,11 +1115,6 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
}
-static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
-{
- return !ubq->ubq_daemon || ubq->ubq_daemon->flags & PF_EXITING;
-}
-
/* todo: handle partial completion */
static inline void __ublk_complete_rq(struct request *req)
{
@@ -1109,7 +1155,7 @@ static inline void __ublk_complete_rq(struct request *req)
if (blk_update_request(req, BLK_STS_OK, io->res))
blk_mq_requeue_request(req, true);
- else
+ else if (likely(!blk_should_fake_timeout(req->q)))
__blk_mq_end_request(req, BLK_STS_OK);
return;
@@ -1117,18 +1163,12 @@ exit:
blk_mq_end_request(req, res);
}
-static void ublk_complete_rq(struct kref *ref)
+static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
+ struct request *req)
{
- struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
- ref);
- struct request *req = blk_mq_rq_from_pdu(data);
-
- __ublk_complete_rq(req);
-}
+ /* read cmd first because req will overwrite it */
+ struct io_uring_cmd *cmd = io->cmd;
-static void ubq_complete_io_cmd(struct ublk_io *io, int res,
- unsigned issue_flags)
-{
/* mark this cmd owned by ublksrv */
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
@@ -1138,8 +1178,17 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res,
*/
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+ io->req = req;
+ return cmd;
+}
+
+static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
+ int res, unsigned issue_flags)
+{
+ struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
+
/* tell ublksrv one io request is coming */
- io_uring_cmd_done(io->cmd, res, 0, issue_flags);
+ io_uring_cmd_done(cmd, res, 0, issue_flags);
}
#define UBLK_REQUEUE_DELAY_MS 3
@@ -1154,28 +1203,97 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_end_request(rq, BLK_STS_IOERR);
}
+static void
+ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
+{
+ unsigned tag = io - ubq->ios;
+ struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
+
+ iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
+}
+
+static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io, unsigned int issue_flags)
+{
+ int ret;
+
+ ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
+ io->buf.index, issue_flags);
+ if (ret) {
+ if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
+ ublk_auto_buf_reg_fallback(ubq, io);
+ return true;
+ }
+ blk_mq_end_request(req, BLK_STS_IOERR);
+ return false;
+ }
+
+ io->task_registered_buffers = 1;
+ io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
+ io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
+ return true;
+}
+
+static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
+ struct request *req, struct ublk_io *io,
+ unsigned int issue_flags)
+{
+ ublk_init_req_ref(ubq, io);
+ if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
+ return ublk_auto_buf_reg(ubq, req, io, issue_flags);
+
+ return true;
+}
+
+static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io)
+{
+ unsigned mapped_bytes = ublk_map_io(ubq, req, io);
+
+ /* partially mapped, update io descriptor */
+ if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
+ /*
+ * Nothing mapped, retry until we succeed.
+ *
+ * We may never succeed in mapping any bytes here because
+ * of OOM. TODO: reserve one buffer with single page pinned
+ * for providing forward progress guarantee.
+ */
+ if (unlikely(!mapped_bytes)) {
+ blk_mq_requeue_request(req, false);
+ blk_mq_delay_kick_requeue_list(req->q,
+ UBLK_REQUEUE_DELAY_MS);
+ return false;
+ }
+
+ ublk_get_iod(ubq, req->tag)->nr_sectors =
+ mapped_bytes >> 9;
+ }
+
+ return true;
+}
+
static void ublk_dispatch_req(struct ublk_queue *ubq,
struct request *req,
unsigned int issue_flags)
{
int tag = req->tag;
struct ublk_io *io = &ubq->ios[tag];
- unsigned int mapped_bytes;
- pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
+ pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
+ __func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
/*
* Task is exiting if either:
*
- * (1) current != ubq_daemon.
+ * (1) current != io->task.
* io_uring_cmd_complete_in_task() tries to run task_work
- * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
+ * in a workqueue if cmd's task is PF_EXITING.
*
* (2) current->flags & PF_EXITING.
*/
- if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
+ if (unlikely(current != io->task || current->flags & PF_EXITING)) {
__ublk_abort_rq(ubq, req);
return;
}
@@ -1183,54 +1301,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
/*
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
- * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
+ * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
* and notify it.
*/
- if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
- io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
- pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
- __func__, io->cmd->cmd_op, ubq->q_id,
- req->tag, io->flags);
- ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
- return;
- }
- /*
- * We have handled UBLK_IO_NEED_GET_DATA command,
- * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
- * do the copy work.
- */
- io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
- /* update iod->addr because ublksrv may have passed a new io buffer */
- ublk_get_iod(ubq, req->tag)->addr = io->addr;
- pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
- ublk_get_iod(ubq, req->tag)->addr);
+ io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
+ pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
+ __func__, ubq->q_id, req->tag, io->flags);
+ ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
+ issue_flags);
+ return;
}
- mapped_bytes = ublk_map_io(ubq, req, io);
-
- /* partially mapped, update io descriptor */
- if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
- /*
- * Nothing mapped, retry until we succeed.
- *
- * We may never succeed in mapping any bytes here because
- * of OOM. TODO: reserve one buffer with single page pinned
- * for providing forward progress guarantee.
- */
- if (unlikely(!mapped_bytes)) {
- blk_mq_requeue_request(req, false);
- blk_mq_delay_kick_requeue_list(req->q,
- UBLK_REQUEUE_DELAY_MS);
- return;
- }
-
- ublk_get_iod(ubq, req->tag)->nr_sectors =
- mapped_bytes >> 9;
- }
+ if (!ublk_start_io(ubq, req, io))
+ return;
- ublk_init_req_ref(ubq, req);
- ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
+ if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
+ ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
}
static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
@@ -1256,24 +1342,22 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
{
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
struct request *rq = pdu->req_list;
- struct ublk_queue *ubq = pdu->ubq;
struct request *next;
do {
next = rq->rq_next;
rq->rq_next = NULL;
- ublk_dispatch_req(ubq, rq, issue_flags);
+ ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
rq = next;
} while (rq);
}
-static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
+static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
{
- struct request *rq = rq_list_peek(l);
- struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
+ struct io_uring_cmd *cmd = io->cmd;
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
- pdu->req_list = rq;
+ pdu->req_list = rq_list_peek(l);
rq_list_init(l);
io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
}
@@ -1281,17 +1365,23 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+ pid_t tgid = ubq->dev->ublksrv_tgid;
+ struct task_struct *p;
+ struct pid *pid;
- if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
- if (!ubq->timeout) {
- send_sig(SIGKILL, ubq->ubq_daemon, 0);
- ubq->timeout = true;
- }
+ if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
+ return BLK_EH_RESET_TIMER;
- return BLK_EH_DONE;
- }
+ if (unlikely(!tgid))
+ return BLK_EH_RESET_TIMER;
- return BLK_EH_RESET_TIMER;
+ rcu_read_lock();
+ pid = find_vpid(tgid);
+ p = pid_task(pid, PIDTYPE_PID);
+ if (p)
+ send_sig(SIGKILL, p, 0);
+ rcu_read_unlock();
+ return BLK_EH_DONE;
}
static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
@@ -1299,7 +1389,7 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
{
blk_status_t res;
- if (unlikely(ubq->fail_io))
+ if (unlikely(READ_ONCE(ubq->fail_io)))
return BLK_STS_TARGET;
/* With recovery feature enabled, force_abort is set in
@@ -1311,7 +1401,8 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
* Note: force_abort is guaranteed to be seen because it is set
	 * before request queue is unquiesced.
*/
- if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
+ if (ublk_nosrv_should_queue_io(ubq) &&
+ unlikely(READ_ONCE(ubq->force_abort)))
return BLK_STS_IOERR;
if (check_cancel && unlikely(ubq->canceling))
@@ -1351,28 +1442,39 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
+static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
+ const struct ublk_io *io2)
+{
+ return (io_uring_cmd_ctx_handle(io->cmd) ==
+ io_uring_cmd_ctx_handle(io2->cmd)) &&
+ (io->task == io2->task);
+}
+
static void ublk_queue_rqs(struct rq_list *rqlist)
{
struct rq_list requeue_list = { };
struct rq_list submit_list = { };
- struct ublk_queue *ubq = NULL;
+ struct ublk_io *io = NULL;
struct request *req;
while ((req = rq_list_pop(rqlist))) {
struct ublk_queue *this_q = req->mq_hctx->driver_data;
+ struct ublk_io *this_io = &this_q->ios[req->tag];
- if (ubq && ubq != this_q && !rq_list_empty(&submit_list))
- ublk_queue_cmd_list(ubq, &submit_list);
- ubq = this_q;
-
- if (ublk_prep_req(ubq, req, true) == BLK_STS_OK)
- rq_list_add_tail(&submit_list, req);
- else
+ if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
rq_list_add_tail(&requeue_list, req);
+ continue;
+ }
+
+ if (io && !ublk_belong_to_same_batch(io, this_io) &&
+ !rq_list_empty(&submit_list))
+ ublk_queue_cmd_list(io, &submit_list);
+ io = this_io;
+ rq_list_add_tail(&submit_list, req);
}
- if (ubq && !rq_list_empty(&submit_list))
- ublk_queue_cmd_list(ubq, &submit_list);
+ if (!rq_list_empty(&submit_list))
+ ublk_queue_cmd_list(io, &submit_list);
*rqlist = requeue_list;
}
@@ -1400,17 +1502,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
/* All old ioucmds have to be completed */
ubq->nr_io_ready = 0;
- /*
- * old daemon is PF_EXITING, put it now
- *
- * It could be NULL in case of closing one quisced device.
- */
- if (ubq->ubq_daemon)
- put_task_struct(ubq->ubq_daemon);
- /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
- ubq->ubq_daemon = NULL;
- ubq->timeout = false;
-
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -1421,6 +1512,20 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
io->flags &= UBLK_IO_FLAG_CANCELED;
io->cmd = NULL;
io->addr = 0;
+
+ /*
+ * old task is PF_EXITING, put it now
+ *
+ * It could be NULL in case of closing one quiesced
+ * device.
+ */
+ if (io->task) {
+ put_task_struct(io->task);
+ io->task = NULL;
+ }
+
+ WARN_ON_ONCE(refcount_read(&io->ref));
+ WARN_ON_ONCE(io->task_registered_buffers);
}
}
@@ -1432,6 +1537,7 @@ static int ublk_ch_open(struct inode *inode, struct file *filp)
if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
return -EBUSY;
filp->private_data = ub;
+ ub->ublksrv_tgid = current->tgid;
return 0;
}
@@ -1442,10 +1548,11 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_queue_reinit(ub, ublk_get_queue(ub, i));
- /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
+ /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
ub->mm = NULL;
ub->nr_queues_ready = 0;
- ub->nr_privileged_daemon = 0;
+ ub->unprivileged_daemons = false;
+ ub->ublksrv_tgid = -1;
}
static struct gendisk *ublk_get_disk(struct ublk_device *ub)
@@ -1467,6 +1574,27 @@ static void ublk_put_disk(struct gendisk *disk)
put_device(disk_to_dev(disk));
}
+/*
+ * Use this function to ensure that ->canceling is consistently set for
+ * the device and all queues. Do not set these flags directly.
+ *
+ * Caller must ensure that:
+ * - cancel_mutex is held. This ensures that there is no concurrent
+ * access to ub->canceling and no concurrent writes to ubq->canceling.
+ * - there are no concurrent reads of ubq->canceling from the queue_rq
+ * path. This can be done by quiescing the queue, or through other
+ * means.
+ */
+static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
+ __must_hold(&ub->cancel_mutex)
+{
+ int i;
+
+ ub->canceling = canceling;
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ ublk_get_queue(ub, i)->canceling = canceling;
+}
+
static int ublk_ch_release(struct inode *inode, struct file *filp)
{
struct ublk_device *ub = filp->private_data;
@@ -1495,12 +1623,11 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* All requests may be inflight, so ->canceling may not be set, set
* it now.
*/
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
- struct ublk_queue *ubq = ublk_get_queue(ub, i);
-
- ubq->canceling = true;
- ublk_abort_queue(ub, ubq);
- }
+ mutex_lock(&ub->cancel_mutex);
+ ublk_set_canceling(ub, true);
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ ublk_abort_queue(ub, ublk_get_queue(ub, i));
+ mutex_unlock(&ub->cancel_mutex);
blk_mq_kick_requeue_list(disk->queue);
/*
@@ -1518,7 +1645,6 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* Transition the device to the nosrv state. What exactly this
* means depends on the recovery flags
*/
- blk_mq_quiesce_queue(disk->queue);
if (ublk_nosrv_should_stop_dev(ub)) {
/*
* Allow any pending/future I/O to pass through quickly
@@ -1526,8 +1652,7 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* waits for all pending I/O to complete
*/
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_get_queue(ub, i)->force_abort = true;
- blk_mq_unquiesce_queue(disk->queue);
+ WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
ublk_stop_dev_unlocked(ub);
} else {
@@ -1537,9 +1662,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
} else {
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_get_queue(ub, i)->fail_io = true;
+ WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
}
- blk_mq_unquiesce_queue(disk->queue);
}
unlock:
mutex_unlock(&ub->mutex);
@@ -1590,30 +1714,6 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
-static void ublk_commit_completion(struct ublk_device *ub,
- const struct ublksrv_io_cmd *ub_cmd)
-{
- u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
- struct ublk_queue *ubq = ublk_get_queue(ub, qid);
- struct ublk_io *io = &ubq->ios[tag];
- struct request *req;
-
- /* now this cmd slot is owned by nbd driver */
- io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
- io->res = ub_cmd->result;
-
- /* find the io request and complete */
- req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
- if (WARN_ON_ONCE(unlikely(!req)))
- return;
-
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
-
- if (likely(!blk_should_fake_timeout(req->q)))
- ublk_put_req_ref(ubq, req);
-}
-
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
struct request *req)
{
@@ -1642,37 +1742,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
- struct request *rq;
-
- /*
- * Either we fail the request or ublk_rq_task_work_cb
- * will do it
- */
- rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq && blk_mq_request_started(rq))
- __ublk_fail_req(ubq, io, rq);
- }
+ if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
+ __ublk_fail_req(ubq, io, io->req);
}
}
-/* Must be called when queue is frozen */
-static void ublk_mark_queue_canceling(struct ublk_queue *ubq)
+static void ublk_start_cancel(struct ublk_device *ub)
{
- spin_lock(&ubq->cancel_lock);
- if (!ubq->canceling)
- ubq->canceling = true;
- spin_unlock(&ubq->cancel_lock);
-}
-
-static void ublk_start_cancel(struct ublk_queue *ubq)
-{
- struct ublk_device *ub = ubq->dev;
struct gendisk *disk = ublk_get_disk(ub);
/* Our disk has been dead */
if (!disk)
return;
+
+ mutex_lock(&ub->cancel_mutex);
+ if (ub->canceling)
+ goto out;
/*
* Now we are serialized with ublk_queue_rq()
*
@@ -1681,8 +1766,10 @@ static void ublk_start_cancel(struct ublk_queue *ubq)
* touch completed uring_cmd
*/
blk_mq_quiesce_queue(disk->queue);
- ublk_mark_queue_canceling(ubq);
+ ublk_set_canceling(ub, true);
blk_mq_unquiesce_queue(disk->queue);
+out:
+ mutex_unlock(&ub->cancel_mutex);
ublk_put_disk(disk);
}
@@ -1742,6 +1829,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
struct ublk_queue *ubq = pdu->ubq;
struct task_struct *task;
+ struct ublk_io *io;
if (WARN_ON_ONCE(!ubq))
return;
@@ -1750,13 +1838,13 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
return;
task = io_uring_cmd_get_task(cmd);
- if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
+ io = &ubq->ios[pdu->tag];
+ if (WARN_ON_ONCE(task && task != io->task))
return;
- if (!ubq->canceling)
- ublk_start_cancel(ubq);
+ ublk_start_cancel(ubq->dev);
- WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd);
+ WARN_ON_ONCE(io->cmd != cmd);
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
}
@@ -1878,9 +1966,11 @@ static void ublk_reset_io_flags(struct ublk_device *ub)
for (j = 0; j < ubq->q_depth; j++)
ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
spin_unlock(&ubq->cancel_lock);
- ubq->canceling = false;
ubq->fail_io = false;
}
+ mutex_lock(&ub->cancel_mutex);
+ ublk_set_canceling(ub, false);
+ mutex_unlock(&ub->cancel_mutex);
}
/* device can only be started after all IOs are ready */
@@ -1888,14 +1978,10 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
__must_hold(&ub->mutex)
{
ubq->nr_io_ready++;
- if (ublk_queue_ready(ubq)) {
- ubq->ubq_daemon = current;
- get_task_struct(ubq->ubq_daemon);
+ if (ublk_queue_ready(ubq))
ub->nr_queues_ready++;
-
- if (capable(CAP_SYS_ADMIN))
- ub->nr_privileged_daemon++;
- }
+ if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
+ ub->unprivileged_daemons = true;
if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
/* now we are ready for handling ublk io request */
@@ -1917,12 +2003,66 @@ static inline int ublk_check_cmd_op(u32 cmd_op)
return 0;
}
-static inline void ublk_fill_io_cmd(struct ublk_io *io,
- struct io_uring_cmd *cmd, unsigned long buf_addr)
+static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
+{
+ io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+
+ if (io->buf.reserved0 || io->buf.reserved1)
+ return -EINVAL;
+
+ if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
+ return -EINVAL;
+ return 0;
+}
+
+static int ublk_handle_auto_buf_reg(struct ublk_io *io,
+ struct io_uring_cmd *cmd,
+ u16 *buf_idx)
{
+ if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
+ io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
+
+ /*
+	 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
+	 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
+	 * `io_ring_ctx`.
+	 *
+	 * If this uring_cmd's io_ring_ctx is not the same as the one
+	 * used to register the buffer, it is the ublk server's
+	 * responsibility to unregister the buffer; otherwise this
+	 * ublk request gets stuck.
+ */
+ if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
+ *buf_idx = io->buf.index;
+ }
+
+ return ublk_set_auto_buf_reg(io, cmd);
+}
+
+/* Once we return, `io->req` can't be used any more */
+static inline struct request *
+ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
+{
+ struct request *req = io->req;
+
io->cmd = cmd;
io->flags |= UBLK_IO_FLAG_ACTIVE;
+ /* now this cmd slot is owned by ublk driver */
+ io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+
+ return req;
+}
+
+static inline int
+ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io,
+ struct io_uring_cmd *cmd, unsigned long buf_addr,
+ u16 *buf_idx)
+{
+ if (ublk_support_auto_buf_reg(ubq))
+ return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
+
io->addr = buf_addr;
+ return 0;
}
static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
@@ -1944,54 +2084,99 @@ static void ublk_io_release(void *priv)
{
struct request *rq = priv;
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+ struct ublk_io *io = &ubq->ios[rq->tag];
- ublk_put_req_ref(ubq, rq);
+ /*
+ * task_registered_buffers may be 0 if buffers were registered off task
+ * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
+ */
+ if (current == io->task && io->task_registered_buffers)
+ io->task_registered_buffers--;
+ else
+ ublk_put_req_ref(io, rq);
}
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, unsigned int tag,
+ const struct ublk_queue *ubq,
+ struct ublk_io *io,
unsigned int index, unsigned int issue_flags)
{
struct ublk_device *ub = cmd->file->private_data;
- const struct ublk_io *io = &ubq->ios[tag];
struct request *req;
int ret;
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- return -EINVAL;
-
- req = __ublk_check_and_get_req(ub, ubq, tag, 0);
+ req = __ublk_check_and_get_req(ub, ubq, io, 0);
if (!req)
return -EINVAL;
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
issue_flags);
if (ret) {
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(io, req);
return ret;
}
return 0;
}
-static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, unsigned int tag,
- unsigned int index, unsigned int issue_flags)
+static int
+ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
+ const struct ublk_queue *ubq, struct ublk_io *io,
+ unsigned index, unsigned issue_flags)
{
- const struct ublk_io *io = &ubq->ios[tag];
+ unsigned new_registered_buffers;
+ struct request *req = io->req;
+ int ret;
- if (!ublk_support_zero_copy(ubq))
+ /*
+ * Ensure there are still references for ublk_sub_req_ref() to release.
+ * If not, fall back on the thread-safe buffer registration.
+ */
+ new_registered_buffers = io->task_registered_buffers + 1;
+ if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
+ return ublk_register_io_buf(cmd, ubq, io, index, issue_flags);
+
+ if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
return -EINVAL;
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+ ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
+ issue_flags);
+ if (ret)
+ return ret;
+
+ io->task_registered_buffers = new_registered_buffers;
+ return 0;
+}
+
+static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
+ const struct ublk_device *ub,
+ unsigned int index, unsigned int issue_flags)
+{
+ if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
return -EINVAL;
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
+static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
+{
+ if (ublk_need_map_io(ubq)) {
+ /*
+ * FETCH_RQ has to provide IO buffer if NEED GET
+ * DATA is not enabled
+ */
+ if (!buf_addr && !ublk_need_get_data(ubq))
+ return -EINVAL;
+ } else if (buf_addr) {
+ /* User copy requires addr to be unset */
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
struct ublk_io *io, __u64 buf_addr)
{
@@ -2018,59 +2203,135 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
- if (ublk_need_map_io(ubq)) {
- /*
- * FETCH_RQ has to provide IO buffer if NEED GET
- * DATA is not enabled
- */
- if (!buf_addr && !ublk_need_get_data(ubq))
- goto out;
- } else if (buf_addr) {
- /* User copy requires addr to be unset */
- ret = -EINVAL;
+ ublk_fill_io_cmd(io, cmd);
+ ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL);
+ if (ret)
goto out;
- }
- ublk_fill_io_cmd(io, cmd, buf_addr);
+ WRITE_ONCE(io->task, get_task_struct(current));
ublk_mark_io_ready(ub, ubq);
out:
mutex_unlock(&ub->mutex);
return ret;
}
+static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
+ struct ublk_io *io, __u64 buf_addr)
+{
+ struct request *req = io->req;
+
+ if (ublk_need_map_io(ubq)) {
+ /*
+ * COMMIT_AND_FETCH_REQ has to provide IO buffer if
+ * NEED GET DATA is not enabled or it is Read IO.
+ */
+ if (!buf_addr && (!ublk_need_get_data(ubq) ||
+ req_op(req) == REQ_OP_READ))
+ return -EINVAL;
+ } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
+ /*
+ * User copy requires addr to be unset when command is
+ * not zone append
+ */
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool ublk_need_complete_req(const struct ublk_queue *ubq,
+ struct ublk_io *io)
+{
+ if (ublk_need_req_ref(ubq))
+ return ublk_sub_req_ref(io);
+ return true;
+}
+
+static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
+ struct request *req)
+{
+ /*
+ * We have handled UBLK_IO_NEED_GET_DATA command,
+ * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
+ * do the copy work.
+ */
+ io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
+ /* update iod->addr because ublksrv may have passed a new io buffer */
+ ublk_get_iod(ubq, req->tag)->addr = io->addr;
+ pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
+ __func__, ubq->q_id, req->tag, io->flags,
+ ublk_get_iod(ubq, req->tag)->addr);
+
+ return ublk_start_io(ubq, req, io);
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
{
+ u16 buf_idx = UBLK_INVALID_BUF_IDX;
struct ublk_device *ub = cmd->file->private_data;
struct ublk_queue *ubq;
struct ublk_io *io;
u32 cmd_op = cmd->cmd_op;
unsigned tag = ub_cmd->tag;
- int ret = -EINVAL;
struct request *req;
+ int ret;
+ bool compl;
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
ub_cmd->result);
- if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
+ ret = ublk_check_cmd_op(cmd_op);
+ if (ret)
goto out;
- ubq = ublk_get_queue(ub, ub_cmd->q_id);
- if (!ubq || ub_cmd->q_id != ubq->q_id)
- goto out;
+ /*
+ * io_buffer_unregister_bvec() doesn't access the ubq or io,
+ * so no need to validate the q_id, tag, or task
+ */
+ if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
+ return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
+ issue_flags);
- if (ubq->ubq_daemon && ubq->ubq_daemon != current)
+ ret = -EINVAL;
+ if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
goto out;
+ ubq = ublk_get_queue(ub, ub_cmd->q_id);
+
if (tag >= ubq->q_depth)
goto out;
io = &ubq->ios[tag];
+ /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
+ if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
+ ret = ublk_check_fetch_buf(ubq, ub_cmd->addr);
+ if (ret)
+ goto out;
+ ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+ if (ret)
+ goto out;
+
+ ublk_prep_cancel(cmd, issue_flags, ubq, tag);
+ return -EIOCBQUEUED;
+ }
+
+ if (READ_ONCE(io->task) != current) {
+ /*
+ * ublk_register_io_buf() accesses only the io's refcount,
+ * so can be handled on any task
+ */
+ if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
+ return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+ issue_flags);
+
+ goto out;
+ }
/* there is pending io cmd, something must be wrong */
- if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
ret = -EBUSY;
goto out;
}
@@ -2083,54 +2344,44 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
- ret = ublk_check_cmd_op(cmd_op);
- if (ret)
- goto out;
-
- ret = -EINVAL;
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
- return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
- case UBLK_IO_UNREGISTER_IO_BUF:
- return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
- case UBLK_IO_FETCH_REQ:
- ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
- if (ret)
- goto out;
- break;
+ return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+ issue_flags);
case UBLK_IO_COMMIT_AND_FETCH_REQ:
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
-
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+ ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr);
+ if (ret)
goto out;
+ io->res = ub_cmd->result;
+ req = ublk_fill_io_cmd(io, cmd);
+ ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx);
+ compl = ublk_need_complete_req(ubq, io);
+
+ /* can't touch 'ublk_io' any more */
+ if (buf_idx != UBLK_INVALID_BUF_IDX)
+ io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
+ if (req_op(req) == REQ_OP_ZONE_APPEND)
+ req->__sector = ub_cmd->zone_append_lba;
+ if (compl)
+ __ublk_complete_rq(req);
- if (ublk_need_map_io(ubq)) {
- /*
- * COMMIT_AND_FETCH_REQ has to provide IO buffer if
- * NEED GET DATA is not enabled or it is Read IO.
- */
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
- req_op(req) == REQ_OP_READ))
- goto out;
- } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
- /*
- * User copy requires addr to be unset when command is
- * not zone append
- */
- ret = -EINVAL;
+ if (ret)
goto out;
- }
-
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
- ublk_dispatch_req(ubq, req, issue_flags);
- return -EIOCBQUEUED;
+ /*
+	 * ublk_get_data() may fail and fall back to requeueing, so keep
+	 * the uring_cmd active first and prepare to handle the newly
+	 * requeued request
+ */
+ req = ublk_fill_io_cmd(io, cmd);
+ ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL);
+ WARN_ON_ONCE(ret);
+ if (likely(ublk_get_data(ubq, io, req))) {
+ __ublk_prep_compl_io_cmd(io, req);
+ return UBLK_IO_RES_OK;
+ }
+ break;
default:
goto out;
}
@@ -2144,15 +2395,20 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
}
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, int tag, size_t offset)
+ const struct ublk_queue *ubq, struct ublk_io *io, size_t offset)
{
+ unsigned tag = io - ubq->ios;
struct request *req;
+ /*
+ * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
+ * which would overwrite it with io->cmd
+ */
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
if (!req)
return NULL;
- if (!ublk_get_req_ref(ubq, req))
+ if (!ublk_get_req_ref(io))
return NULL;
if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
@@ -2166,7 +2422,7 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
return req;
fail_put:
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(io, req);
return NULL;
}
@@ -2233,7 +2489,8 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
}
static struct request *ublk_check_and_get_req(struct kiocb *iocb,
- struct iov_iter *iter, size_t *off, int dir)
+ struct iov_iter *iter, size_t *off, int dir,
+ struct ublk_io **io)
{
struct ublk_device *ub = iocb->ki_filp->private_data;
struct ublk_queue *ubq;
@@ -2267,7 +2524,8 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
if (tag >= ubq->q_depth)
return ERR_PTR(-EINVAL);
- req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
+ *io = &ubq->ios[tag];
+ req = __ublk_check_and_get_req(ub, ubq, *io, buf_off);
if (!req)
return ERR_PTR(-EINVAL);
@@ -2280,42 +2538,40 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
*off = buf_off;
return req;
fail:
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(*io, req);
return ERR_PTR(-EACCES);
}
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct ublk_queue *ubq;
struct request *req;
+ struct ublk_io *io;
size_t buf_off;
size_t ret;
- req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
+ req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io);
if (IS_ERR(req))
return PTR_ERR(req);
ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
- ubq = req->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(io, req);
return ret;
}
static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct ublk_queue *ubq;
struct request *req;
+ struct ublk_io *io;
size_t buf_off;
size_t ret;
- req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
+ req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io);
if (IS_ERR(req))
return PTR_ERR(req);
ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
- ubq = req->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(io, req);
return ret;
}
@@ -2334,9 +2590,16 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
int size = ublk_queue_cmd_buf_size(ub, q_id);
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+ int i;
+
+ for (i = 0; i < ubq->q_depth; i++) {
+ struct ublk_io *io = &ubq->ios[i];
+ if (io->task)
+ put_task_struct(io->task);
+ WARN_ON_ONCE(refcount_read(&io->ref));
+ WARN_ON_ONCE(io->task_registered_buffers);
+ }
- if (ubq->ubq_daemon)
- put_task_struct(ubq->ubq_daemon);
if (ubq->io_cmd_buf)
free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
}
@@ -2373,7 +2636,7 @@ static void ublk_deinit_queues(struct ublk_device *ub)
for (i = 0; i < nr_queues; i++)
ublk_deinit_queue(ub, i);
- kfree(ub->__queues);
+ kvfree(ub->__queues);
}
static int ublk_init_queues(struct ublk_device *ub)
@@ -2384,7 +2647,7 @@ static int ublk_init_queues(struct ublk_device *ub)
int i, ret = -ENOMEM;
ub->queue_size = ubq_size;
- ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
+ ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
if (!ub->__queues)
return ret;
@@ -2440,6 +2703,7 @@ static void ublk_cdev_rel(struct device *dev)
ublk_deinit_queues(ub);
ublk_free_dev_number(ub);
mutex_destroy(&ub->mutex);
+ mutex_destroy(&ub->cancel_mutex);
kfree(ub);
}
@@ -2487,7 +2751,6 @@ static int ublk_add_tag_set(struct ublk_device *ub)
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
ub->tag_set.numa_node = NUMA_NO_NODE;
- ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
ub->tag_set.driver_data = ub;
return blk_mq_alloc_tag_set(&ub->tag_set);
}
@@ -2589,6 +2852,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
+ if (ub->ublksrv_tgid != ublksrv_pid)
+ return -EINVAL;
+
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
test_bit(UB_STATE_USED, &ub->state)) {
@@ -2610,8 +2876,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
ublk_apply_params(ub);
- /* don't probe partitions if any one ubq daemon is un-trusted */
- if (ub->nr_privileged_daemon != ub->nr_queues_ready)
+ /* don't probe partitions if any daemon task is un-trusted */
+ if (ub->unprivileged_daemons)
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
ublk_get_device(ub);
@@ -2710,6 +2976,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
if (copy_from_user(&info, argp, sizeof(info)))
return -EFAULT;
+ if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
+ info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
+ return -EINVAL;
+
if (capable(CAP_SYS_ADMIN))
info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
@@ -2728,6 +2998,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
return -EINVAL;
}
+ if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
+ pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
+ return -EINVAL;
+ }
+
/*
* unprivileged device can't be trusted, but RECOVERY and
* RECOVERY_REISSUE still may hang error handling, so can't
@@ -2744,8 +3019,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
	 * For USER_COPY, we depend on userspace to fill the request
* buffer by pwrite() to ublk char device, which can't be
* used for unprivileged device
+ *
+ * Same with zero copy or auto buffer register.
*/
- if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
+ if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG))
return -EINVAL;
}
@@ -2781,6 +3059,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
goto out_unlock;
mutex_init(&ub->mutex);
spin_lock_init(&ub->lock);
+ mutex_init(&ub->cancel_mutex);
ret = ublk_alloc_dev_number(ub, header->dev_id);
if (ret < 0)
@@ -2800,10 +3079,13 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
ub->dev_info.flags &= UBLK_F_ALL;
ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
- UBLK_F_URING_CMD_COMP_IN_TASK;
+ UBLK_F_URING_CMD_COMP_IN_TASK |
+ UBLK_F_PER_IO_DAEMON |
+ UBLK_F_BUF_REG_OFF_DAEMON;
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
- if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
+ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
/*
@@ -2849,6 +3131,7 @@ out_free_dev_number:
ublk_free_dev_number(ub);
out_free_ub:
mutex_destroy(&ub->mutex);
+ mutex_destroy(&ub->cancel_mutex);
kfree(ub);
out_unlock:
mutex_unlock(&ublk_ctl_mutex);
@@ -3064,14 +3347,17 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
int ublksrv_pid = (int)header->data[0];
int ret = -EINVAL;
- pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
- __func__, ub->dev_info.nr_hw_queues, header->dev_id);
- /* wait until new ubq_daemon sending all FETCH_REQ */
+ pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
+ header->dev_id);
+
if (wait_for_completion_interruptible(&ub->completion))
return -EINTR;
- pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
- __func__, ub->dev_info.nr_hw_queues, header->dev_id);
+ pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
+ header->dev_id);
+
+ if (ub->ublksrv_tgid != ublksrv_pid)
+ return -EINVAL;
mutex_lock(&ub->mutex);
if (ublk_nosrv_should_stop_dev(ub))
@@ -3106,6 +3392,125 @@ static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
return 0;
}
+static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
+{
+ struct ublk_param_basic *p = &ub->params.basic;
+ u64 new_size = header->data[0];
+
+ mutex_lock(&ub->mutex);
+ p->dev_sectors = new_size;
+ set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
+ mutex_unlock(&ub->mutex);
+}
+
+struct count_busy {
+ const struct ublk_queue *ubq;
+ unsigned int nr_busy;
+};
+
+static bool ublk_count_busy_req(struct request *rq, void *data)
+{
+ struct count_busy *idle = data;
+
+ if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
+ idle->nr_busy += 1;
+ return true;
+}
+
+/* uring_cmd is guaranteed to be active if the associated request is idle */
+static bool ubq_has_idle_io(const struct ublk_queue *ubq)
+{
+ struct count_busy data = {
+ .ubq = ubq,
+ };
+
+ blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
+ return data.nr_busy < ubq->q_depth;
+}
+
+/* Wait until each hw queue has at least one idle IO */
+static int ublk_wait_for_idle_io(struct ublk_device *ub,
+ unsigned int timeout_ms)
+{
+ unsigned int elapsed = 0;
+ int ret;
+
+ while (elapsed < timeout_ms && !signal_pending(current)) {
+ unsigned int queues_cancelable = 0;
+ int i;
+
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ queues_cancelable += !!ubq_has_idle_io(ubq);
+ }
+
+ /*
+ * Each queue needs at least one active command for
+ * notifying ublk server
+ */
+ if (queues_cancelable == ub->dev_info.nr_hw_queues)
+ break;
+
+ msleep(UBLK_REQUEUE_DELAY_MS);
+ elapsed += UBLK_REQUEUE_DELAY_MS;
+ }
+
+ if (signal_pending(current))
+ ret = -EINTR;
+ else if (elapsed >= timeout_ms)
+ ret = -EBUSY;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
+ const struct ublksrv_ctrl_cmd *header)
+{
+ /* zero means wait forever */
+ u64 timeout_ms = header->data[0];
+ struct gendisk *disk;
+ int ret = -ENODEV;
+
+ if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&ub->mutex);
+ disk = ublk_get_disk(ub);
+ if (!disk)
+ goto unlock;
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ goto put_disk;
+
+ ret = 0;
+ /* already in expected state */
+ if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+ goto put_disk;
+
+ /* Mark the device as canceling */
+ mutex_lock(&ub->cancel_mutex);
+ blk_mq_quiesce_queue(disk->queue);
+ ublk_set_canceling(ub, true);
+ blk_mq_unquiesce_queue(disk->queue);
+ mutex_unlock(&ub->cancel_mutex);
+
+ if (!timeout_ms)
+ timeout_ms = UINT_MAX;
+ ret = ublk_wait_for_idle_io(ub, timeout_ms);
+
+put_disk:
+ ublk_put_disk(disk);
+unlock:
+ mutex_unlock(&ub->mutex);
+
+ /* Cancel pending uring_cmd */
+ if (!ret)
+ ublk_cancel_dev(ub);
+ return ret;
+}
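
As an illustration only (not part of the patch), here is a rough userspace sketch of how a ublk server could drive the new control commands. It assumes liburing, a ring set up with IORING_SETUP_SQE128, and the ioctl-encoded UBLK_U_CMD_UPDATE_SIZE name from the ublk UAPI header; those names and the struct layout are assumptions, not verified against the final UAPI. UBLK_CMD_QUIESCE_DEV uses the same plumbing, with data[0] carrying the timeout in milliseconds (zero meaning wait forever).

/*
 * Illustrative sketch, not from this patch. Assumes <linux/ublk_cmd.h>
 * exports struct ublksrv_ctrl_cmd and UBLK_U_CMD_UPDATE_SIZE, and that
 * the ring was created with IORING_SETUP_SQE128.
 */
#include <liburing.h>
#include <linux/ublk_cmd.h>
#include <string.h>

static int ublk_ctrl_update_size(struct io_uring *ring, int ctrl_fd,
				 unsigned int dev_id,
				 unsigned long long new_sectors)
{
	struct ublksrv_ctrl_cmd hdr = {
		.dev_id = dev_id,
		.queue_id = (unsigned short)-1,
		.data = { new_sectors },	/* new capacity in 512-byte sectors */
	};
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -1;
	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, ctrl_fd, NULL, 0, 0);
	sqe->cmd_op = UBLK_U_CMD_UPDATE_SIZE;
	memcpy(sqe->cmd, &hdr, sizeof(hdr));	/* command payload in SQE128 area */

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret)
		return ret;
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}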
+
/*
* All control commands are sent via /dev/ublk-control, so we have to check
* the destination device's permission
@@ -3191,6 +3596,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
case UBLK_CMD_SET_PARAMS:
case UBLK_CMD_START_USER_RECOVERY:
case UBLK_CMD_END_USER_RECOVERY:
+ case UBLK_CMD_UPDATE_SIZE:
+ case UBLK_CMD_QUIESCE_DEV:
mask = MAY_READ | MAY_WRITE;
break;
default:
@@ -3282,6 +3689,13 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_END_USER_RECOVERY:
ret = ublk_ctrl_end_recovery(ub, header);
break;
+ case UBLK_CMD_UPDATE_SIZE:
+ ublk_ctrl_set_size(ub, header);
+ ret = 0;
+ break;
+ case UBLK_CMD_QUIESCE_DEV:
+ ret = ublk_ctrl_quiesce_dev(ub, header);
+ break;
default:
ret = -EOPNOTSUPP;
break;
@@ -3315,6 +3729,7 @@ static int __init ublk_init(void)
BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
+ BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
init_waitqueue_head(&ublk_idr_wq);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 7cffea01d868..e649fa67bac1 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -571,7 +571,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
- err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL);
+ err = blk_rq_map_kern(req, report_buf, report_len, GFP_KERNEL);
if (err)
goto out;
@@ -817,7 +817,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
vbr->out_hdr.sector = 0;
- err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
+ err = blk_rq_map_kern(req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
if (err)
goto out;
@@ -976,9 +976,8 @@ static int init_vq(struct virtio_blk *vblk)
return -EINVAL;
}
- num_vqs = min_t(unsigned int,
- min_not_zero(num_request_queues, nr_cpu_ids),
- num_vqs);
+ num_vqs = blk_mq_num_possible_queues(
+ min_not_zero(num_request_queues, num_vqs));
num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1);
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
new file mode 100644
index 000000000000..a423228e201b
--- /dev/null
+++ b/drivers/block/zloop.c
@@ -0,0 +1,1386 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Christoph Hellwig.
+ * Copyright (c) 2025, Western Digital Corporation or its affiliates.
+ *
+ * Zoned Loop Device driver - exports a zoned block device using one file per
+ * zone as backing storage.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+#include <linux/blkzoned.h>
+#include <linux/pagemap.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/mutex.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+
+/*
+ * Options for adding (and removing) a device.
+ */
+enum {
+ ZLOOP_OPT_ERR = 0,
+ ZLOOP_OPT_ID = (1 << 0),
+ ZLOOP_OPT_CAPACITY = (1 << 1),
+ ZLOOP_OPT_ZONE_SIZE = (1 << 2),
+ ZLOOP_OPT_ZONE_CAPACITY = (1 << 3),
+ ZLOOP_OPT_NR_CONV_ZONES = (1 << 4),
+ ZLOOP_OPT_BASE_DIR = (1 << 5),
+ ZLOOP_OPT_NR_QUEUES = (1 << 6),
+ ZLOOP_OPT_QUEUE_DEPTH = (1 << 7),
+ ZLOOP_OPT_BUFFERED_IO = (1 << 8),
+};
+
+static const match_table_t zloop_opt_tokens = {
+ { ZLOOP_OPT_ID, "id=%d" },
+ { ZLOOP_OPT_CAPACITY, "capacity_mb=%u" },
+ { ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" },
+ { ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" },
+ { ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" },
+ { ZLOOP_OPT_BASE_DIR, "base_dir=%s" },
+ { ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" },
+ { ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
+ { ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
+ { ZLOOP_OPT_ERR, NULL }
+};
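
For illustration (not part of the patch): these tokens are parsed from plain text written to the zloop-control misc device registered at the bottom of this file. A minimal userspace sketch, using only POSIX calls; the id, sizes, and paths are examples, and the backing directory <base_dir>/<id> (here the default /var/local/zloop/0) must already exist because the driver only opens it, it never creates it.

/* Illustrative sketch, not from this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/zloop-control", O_WRONLY);

	if (fd < 0)
		return 1;
	/* 4 GB device: 16 zones of 256 MB, the first 8 conventional. */
	dprintf(fd, "add id=0 capacity_mb=4096 zone_size_mb=256 conv_zones=8");
	/* ... /dev/zloop0 is now usable; tear it down when done ... */
	dprintf(fd, "remove id=0");
	close(fd);
	return 0;
}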
+
+/* Default values for the "add" operation. */
+#define ZLOOP_DEF_ID -1
+#define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT)
+#define ZLOOP_DEF_NR_ZONES 64
+#define ZLOOP_DEF_NR_CONV_ZONES 8
+#define ZLOOP_DEF_BASE_DIR "/var/local/zloop"
+#define ZLOOP_DEF_NR_QUEUES 1
+#define ZLOOP_DEF_QUEUE_DEPTH 128
+#define ZLOOP_DEF_BUFFERED_IO false
+
+/* Arbitrary limit on the zone size (16GB). */
+#define ZLOOP_MAX_ZONE_SIZE_MB 16384
+
+struct zloop_options {
+ unsigned int mask;
+ int id;
+ sector_t capacity;
+ sector_t zone_size;
+ sector_t zone_capacity;
+ unsigned int nr_conv_zones;
+ char *base_dir;
+ unsigned int nr_queues;
+ unsigned int queue_depth;
+ bool buffered_io;
+};
+
+/*
+ * Device states.
+ */
+enum {
+ Zlo_creating = 0,
+ Zlo_live,
+ Zlo_deleting,
+};
+
+enum zloop_zone_flags {
+ ZLOOP_ZONE_CONV = 0,
+ ZLOOP_ZONE_SEQ_ERROR,
+};
+
+struct zloop_zone {
+ struct file *file;
+
+ unsigned long flags;
+ struct mutex lock;
+ enum blk_zone_cond cond;
+ sector_t start;
+ sector_t wp;
+
+ gfp_t old_gfp_mask;
+};
+
+struct zloop_device {
+ unsigned int id;
+ unsigned int state;
+
+ struct blk_mq_tag_set tag_set;
+ struct gendisk *disk;
+
+ struct workqueue_struct *workqueue;
+ bool buffered_io;
+
+ const char *base_dir;
+ struct file *data_dir;
+
+ unsigned int zone_shift;
+ sector_t zone_size;
+ sector_t zone_capacity;
+ unsigned int nr_zones;
+ unsigned int nr_conv_zones;
+ unsigned int block_size;
+
+ struct zloop_zone zones[] __counted_by(nr_zones);
+};
+
+struct zloop_cmd {
+ struct work_struct work;
+ atomic_t ref;
+ sector_t sector;
+ sector_t nr_sectors;
+ long ret;
+ struct kiocb iocb;
+ struct bio_vec *bvec;
+};
+
+static DEFINE_IDR(zloop_index_idr);
+static DEFINE_MUTEX(zloop_ctl_mutex);
+
+static unsigned int rq_zone_no(struct request *rq)
+{
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ return blk_rq_pos(rq) >> zlo->zone_shift;
+}
+
+static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ struct kstat stat;
+ sector_t file_sectors;
+ int ret;
+
+ lockdep_assert_held(&zone->lock);
+
+ ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
+ if (ret < 0) {
+ pr_err("Failed to get zone %u file stat (err=%d)\n",
+ zone_no, ret);
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ return ret;
+ }
+
+ file_sectors = stat.size >> SECTOR_SHIFT;
+ if (file_sectors > zlo->zone_capacity) {
+ pr_err("Zone %u file too large (%llu sectors > %llu)\n",
+ zone_no, file_sectors, zlo->zone_capacity);
+ return -EINVAL;
+ }
+
+ if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+ pr_err("Zone %u file size not aligned to block size %u\n",
+ zone_no, zlo->block_size);
+ return -EINVAL;
+ }
+
+ if (!file_sectors) {
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ } else if (file_sectors == zlo->zone_capacity) {
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = zone->start + zlo->zone_size;
+ } else {
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ zone->wp = zone->start + file_sectors;
+ }
+
+ return 0;
+}
+
+static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret)
+ goto unlock;
+ }
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EXP_OPEN:
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_IMP_OPEN:
+ zone->cond = BLK_ZONE_COND_EXP_OPEN;
+ break;
+ case BLK_ZONE_COND_FULL:
+ default:
+ ret = -EIO;
+ break;
+ }
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret)
+ goto unlock;
+ }
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_CLOSED:
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ if (zone->wp == zone->start)
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ else
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ default:
+ ret = -EIO;
+ break;
+ }
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
+ zone->cond == BLK_ZONE_COND_EMPTY)
+ goto unlock;
+
+ if (vfs_truncate(&zone->file->f_path, 0)) {
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_reset_all_zones(struct zloop_device *zlo)
+{
+ unsigned int i;
+ int ret;
+
+ for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
+ ret = zloop_reset_zone(zlo, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
+ zone->cond == BLK_ZONE_COND_FULL)
+ goto unlock;
+
+ if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = zone->start + zlo->zone_size;
+ clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static void zloop_put_cmd(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+
+ if (!atomic_dec_and_test(&cmd->ref))
+ return;
+ kfree(cmd->bvec);
+ cmd->bvec = NULL;
+ if (likely(!blk_should_fake_timeout(rq->q)))
+ blk_mq_complete_request(rq);
+}
+
+static void zloop_rw_complete(struct kiocb *iocb, long ret)
+{
+ struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);
+
+ cmd->ret = ret;
+ zloop_put_cmd(cmd);
+}
+
+static void zloop_rw(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = rq_zone_no(rq);
+ sector_t sector = blk_rq_pos(rq);
+ sector_t nr_sectors = blk_rq_sectors(rq);
+ bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
+ bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
+ int rw = is_write ? ITER_SOURCE : ITER_DEST;
+ struct req_iterator rq_iter;
+ struct zloop_zone *zone;
+ struct iov_iter iter;
+ struct bio_vec tmp;
+ sector_t zone_end;
+ int nr_bvec = 0;
+ int ret;
+
+ atomic_set(&cmd->ref, 2);
+ cmd->sector = sector;
+ cmd->nr_sectors = nr_sectors;
+ cmd->ret = 0;
+
+ /* We should never get an I/O beyond the device capacity. */
+ if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
+ ret = -EIO;
+ goto out;
+ }
+ zone = &zlo->zones[zone_no];
+ zone_end = zone->start + zlo->zone_capacity;
+
+ /*
+ * The block layer should never send requests that are not fully
+ * contained within the zone.
+ */
+ if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ mutex_lock(&zone->lock);
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ mutex_unlock(&zone->lock);
+ if (ret)
+ goto out;
+ }
+
+ if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
+ mutex_lock(&zone->lock);
+
+ if (is_append) {
+ sector = zone->wp;
+ cmd->sector = sector;
+ }
+
+ /*
+ * Write operations must be aligned to the write pointer and
+ * fully contained within the zone capacity.
+ */
+ if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+ pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
+ zone_no, sector, zone->wp);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ /* Implicitly open the target zone. */
+ if (zone->cond == BLK_ZONE_COND_CLOSED ||
+ zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ /*
+ * Advance the write pointer of sequential zones. If the write
+ * fails, the wp position will be corrected when the next I/O
+	 * completes.
+ */
+ zone->wp += nr_sectors;
+ if (zone->wp == zone_end)
+ zone->cond = BLK_ZONE_COND_FULL;
+ }
+
+ rq_for_each_bvec(tmp, rq, rq_iter)
+ nr_bvec++;
+
+ if (rq->bio != rq->biotail) {
+ struct bio_vec *bvec;
+
+ cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO);
+ if (!cmd->bvec) {
+ ret = -EIO;
+ goto unlock;
+ }
+
+ /*
+ * The bios of the request may be started from the middle of
+ * the 'bvec' because of bio splitting, so we can't directly
+ * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
+ * API will take care of all details for us.
+ */
+ bvec = cmd->bvec;
+ rq_for_each_bvec(tmp, rq, rq_iter) {
+ *bvec = tmp;
+ bvec++;
+ }
+ iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
+ } else {
+ /*
+ * Same here, this bio may be started from the middle of the
+ * 'bvec' because of bio splitting, so offset from the bvec
+ * must be passed to iov iterator
+ */
+ iov_iter_bvec(&iter, rw,
+ __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
+ nr_bvec, blk_rq_bytes(rq));
+ iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
+ }
+
+ cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
+ cmd->iocb.ki_filp = zone->file;
+ cmd->iocb.ki_complete = zloop_rw_complete;
+ if (!zlo->buffered_io)
+ cmd->iocb.ki_flags = IOCB_DIRECT;
+ cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+
+ if (rw == ITER_SOURCE)
+ ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
+ else
+ ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
+unlock:
+ if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
+ mutex_unlock(&zone->lock);
+out:
+ if (ret != -EIOCBQUEUED)
+ zloop_rw_complete(&cmd->iocb, ret);
+ zloop_put_cmd(cmd);
+}
+
+static void zloop_handle_cmd(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ /*
+ * zloop_rw() always executes asynchronously or completes
+ * directly.
+ */
+ zloop_rw(cmd);
+ return;
+ case REQ_OP_FLUSH:
+ /*
+ * Sync the entire FS containing the zone files instead of
+ * walking all files
+ */
+ cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb);
+ break;
+ case REQ_OP_ZONE_RESET:
+ cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ cmd->ret = zloop_reset_all_zones(zlo);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_OPEN:
+ cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ pr_err("Unsupported operation %d\n", req_op(rq));
+ cmd->ret = -EOPNOTSUPP;
+ break;
+ }
+
+ blk_mq_complete_request(rq);
+}
+
+static void zloop_cmd_workfn(struct work_struct *work)
+{
+ struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
+ int orig_flags = current->flags;
+
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+ zloop_handle_cmd(cmd);
+ current->flags = orig_flags;
+}
+
+static void zloop_complete_rq(struct request *rq)
+{
+ struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = cmd->sector >> zlo->zone_shift;
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ blk_status_t sts = BLK_STS_OK;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ if (cmd->ret < 0)
+ pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
+ zone_no, cmd->sector, cmd->nr_sectors);
+
+ if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
+ /* short read */
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
+ }
+ break;
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ if (cmd->ret < 0)
+ pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
+ zone_no,
+ req_op(rq) == REQ_OP_WRITE ? "" : "append ",
+ cmd->sector, cmd->nr_sectors);
+
+ if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
+ pr_err("Zone %u: partial write %ld/%u B\n",
+ zone_no, cmd->ret, blk_rq_bytes(rq));
+ cmd->ret = -EIO;
+ }
+
+ if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+ /*
+ * A write to a sequential zone file failed: mark the
+ * zone as having an error. This will be corrected and
+ * cleared when the next IO is submitted.
+ */
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ break;
+ }
+ if (req_op(rq) == REQ_OP_ZONE_APPEND)
+ rq->__sector = cmd->sector;
+
+ break;
+ default:
+ break;
+ }
+
+ if (cmd->ret < 0)
+ sts = errno_to_blk_status(cmd->ret);
+ blk_mq_end_request(rq, sts);
+}
+
+static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *rq = bd->rq;
+ struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ if (zlo->state == Zlo_deleting)
+ return BLK_STS_IOERR;
+
+ blk_mq_start_request(rq);
+
+ INIT_WORK(&cmd->work, zloop_cmd_workfn);
+ queue_work(zlo->workqueue, &cmd->work);
+
+ return BLK_STS_OK;
+}
+
+static const struct blk_mq_ops zloop_mq_ops = {
+ .queue_rq = zloop_queue_rq,
+ .complete = zloop_complete_rq,
+};
+
+static int zloop_open(struct gendisk *disk, blk_mode_t mode)
+{
+ struct zloop_device *zlo = disk->private_data;
+ int ret;
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ if (zlo->state != Zlo_live)
+ ret = -ENXIO;
+ mutex_unlock(&zloop_ctl_mutex);
+ return ret;
+}
+
+static int zloop_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct zloop_device *zlo = disk->private_data;
+ struct blk_zone blkz = {};
+ unsigned int first, i;
+ int ret;
+
+ first = disk_zone_no(disk, sector);
+ if (first >= zlo->nr_zones)
+ return 0;
+ nr_zones = min(nr_zones, zlo->nr_zones - first);
+
+ for (i = 0; i < nr_zones; i++) {
+ unsigned int zone_no = first + i;
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret) {
+ mutex_unlock(&zone->lock);
+ return ret;
+ }
+ }
+
+ blkz.start = zone->start;
+ blkz.len = zlo->zone_size;
+ blkz.wp = zone->wp;
+ blkz.cond = zone->cond;
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+ blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ blkz.capacity = zlo->zone_size;
+ } else {
+ blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+ blkz.capacity = zlo->zone_capacity;
+ }
+
+ mutex_unlock(&zone->lock);
+
+ ret = cb(&blkz, i, data);
+ if (ret)
+ return ret;
+ }
+
+ return nr_zones;
+}
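
As an aside (not part of the patch), the per-zone state maintained above is exactly what a standard zone report sees, e.g. via blkzone report or the BLKREPORTZONE ioctl. A small sketch under the assumption that the device came up as /dev/zloop0:

/* Illustrative sketch, not from this patch. */
#include <linux/blkzoned.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned int i, nr = 16;
	struct blk_zone_report *rep;
	int fd = open("/dev/zloop0", O_RDONLY);

	if (fd < 0)
		return 1;
	rep = calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
	if (!rep)
		return 1;
	rep->sector = 0;
	rep->nr_zones = nr;
	if (ioctl(fd, BLKREPORTZONE, rep) < 0)
		return 1;
	for (i = 0; i < rep->nr_zones; i++)
		printf("zone %u: start %llu len %llu wp %llu cond %u\n", i,
		       (unsigned long long)rep->zones[i].start,
		       (unsigned long long)rep->zones[i].len,
		       (unsigned long long)rep->zones[i].wp,
		       rep->zones[i].cond);
	free(rep);
	close(fd);
	return 0;
}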
+
+static void zloop_free_disk(struct gendisk *disk)
+{
+ struct zloop_device *zlo = disk->private_data;
+ unsigned int i;
+
+ blk_mq_free_tag_set(&zlo->tag_set);
+
+ for (i = 0; i < zlo->nr_zones; i++) {
+ struct zloop_zone *zone = &zlo->zones[i];
+
+ mapping_set_gfp_mask(zone->file->f_mapping,
+ zone->old_gfp_mask);
+ fput(zone->file);
+ }
+
+ fput(zlo->data_dir);
+ destroy_workqueue(zlo->workqueue);
+ kfree(zlo->base_dir);
+ kvfree(zlo);
+}
+
+static const struct block_device_operations zloop_fops = {
+ .owner = THIS_MODULE,
+ .open = zloop_open,
+ .report_zones = zloop_report_zones,
+ .free_disk = zloop_free_disk,
+};
+
+__printf(3, 4)
+static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
+ const char *fmt, ...)
+{
+ struct file *file;
+ va_list ap;
+ char *p;
+
+ va_start(ap, fmt);
+ p = kvasprintf(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+ file = filp_open(p, oflags, mode);
+ kfree(p);
+ return file;
+}
+
+static int zloop_get_block_size(struct zloop_device *zlo,
+ struct zloop_zone *zone)
+{
+ struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
+ struct kstat st;
+
+ /*
+ * If the FS block size is lower than or equal to 4K, use that as the
+ * device block size. Otherwise, fall back to the FS direct IO alignment
+ * constraint if that is provided, and to the FS underlying device
+ * physical block size if the direct IO alignment is unknown.
+ */
+ if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
+ zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
+ else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
+ (st.result_mask & STATX_DIOALIGN))
+ zlo->block_size = st.dio_offset_align;
+ else if (sb_bdev)
+ zlo->block_size = bdev_physical_block_size(sb_bdev);
+ else
+ zlo->block_size = SECTOR_SIZE;
+
+ if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+ pr_err("Zone capacity is not aligned to block size %u\n",
+ zlo->block_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
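
For illustration (not part of the patch): the direct I/O alignment that the fallback above relies on can be queried from userspace with statx() and STATX_DIOALIGN (available since Linux 6.1). The backing-file path below is only an example of the seq-%06u naming used by this driver.

/* Illustrative sketch, not from this patch. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	if (statx(AT_FDCWD, "/var/local/zloop/0/seq-000008", 0,
		  STATX_DIOALIGN, &stx))
		return 1;
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio offset align: %u, mem align: %u\n",
		       stx.stx_dio_offset_align, stx.stx_dio_mem_align);
	else
		printf("filesystem does not report STATX_DIOALIGN\n");
	return 0;
}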
+
+static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
+ unsigned int zone_no, bool restore)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int oflags = O_RDWR;
+ struct kstat stat;
+ sector_t file_sectors;
+ int ret;
+
+ mutex_init(&zone->lock);
+ zone->start = (sector_t)zone_no << zlo->zone_shift;
+
+ if (!restore)
+ oflags |= O_CREAT;
+
+ if (!opts->buffered_io)
+ oflags |= O_DIRECT;
+
+ if (zone_no < zlo->nr_conv_zones) {
+ /* Conventional zone file. */
+ set_bit(ZLOOP_ZONE_CONV, &zone->flags);
+ zone->cond = BLK_ZONE_COND_NOT_WP;
+ zone->wp = U64_MAX;
+
+ zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
+ zlo->base_dir, zlo->id, zone_no);
+ if (IS_ERR(zone->file)) {
+ pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
+ zone_no, zlo->base_dir, zlo->id, zone_no,
+ PTR_ERR(zone->file));
+ return PTR_ERR(zone->file);
+ }
+
+ if (!zlo->block_size) {
+ ret = zloop_get_block_size(zlo, zone);
+ if (ret)
+ return ret;
+ }
+
+ ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
+ if (ret < 0) {
+ pr_err("Failed to get zone %u file stat\n", zone_no);
+ return ret;
+ }
+ file_sectors = stat.size >> SECTOR_SHIFT;
+
+ if (restore && file_sectors != zlo->zone_size) {
+ pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
+ zone_no, file_sectors, zlo->zone_capacity);
+ return ret;
+ }
+
+ ret = vfs_truncate(&zone->file->f_path,
+ zlo->zone_size << SECTOR_SHIFT);
+ if (ret < 0) {
+ pr_err("Failed to truncate zone %u file (err=%d)\n",
+ zone_no, ret);
+ return ret;
+ }
+
+ return 0;
+ }
+
+ /* Sequential zone file. */
+ zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
+ zlo->base_dir, zlo->id, zone_no);
+ if (IS_ERR(zone->file)) {
+ pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
+ zone_no, zlo->base_dir, zlo->id, zone_no,
+ PTR_ERR(zone->file));
+ return PTR_ERR(zone->file);
+ }
+
+ if (!zlo->block_size) {
+ ret = zloop_get_block_size(zlo, zone);
+ if (ret)
+ return ret;
+ }
+
+ mutex_lock(&zone->lock);
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static bool zloop_dev_exists(struct zloop_device *zlo)
+{
+ struct file *cnv, *seq;
+ bool exists;
+
+ cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
+ zlo->base_dir, zlo->id, 0);
+ seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
+ zlo->base_dir, zlo->id, 0);
+ exists = !IS_ERR(cnv) || !IS_ERR(seq);
+
+ if (!IS_ERR(cnv))
+ fput(cnv);
+ if (!IS_ERR(seq))
+ fput(seq);
+
+ return exists;
+}
+
+static int zloop_ctl_add(struct zloop_options *opts)
+{
+ struct queue_limits lim = {
+ .max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
+ .max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
+ .chunk_sectors = opts->zone_size,
+ .features = BLK_FEAT_ZONED,
+ };
+ unsigned int nr_zones, i, j;
+ struct zloop_device *zlo;
+ int ret = -EINVAL;
+ bool restore;
+
+ __module_get(THIS_MODULE);
+
+ nr_zones = opts->capacity >> ilog2(opts->zone_size);
+ if (opts->nr_conv_zones >= nr_zones) {
+ pr_err("Invalid number of conventional zones %u\n",
+ opts->nr_conv_zones);
+ goto out;
+ }
+
+ zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
+ if (!zlo) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ zlo->state = Zlo_creating;
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ goto out_free_dev;
+
+ /* Allocate id, if @opts->id >= 0, we're requesting that specific id */
+ if (opts->id >= 0) {
+ ret = idr_alloc(&zloop_index_idr, zlo,
+ opts->id, opts->id + 1, GFP_KERNEL);
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ } else {
+ ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
+ }
+ mutex_unlock(&zloop_ctl_mutex);
+ if (ret < 0)
+ goto out_free_dev;
+
+ zlo->id = ret;
+ zlo->zone_shift = ilog2(opts->zone_size);
+ zlo->zone_size = opts->zone_size;
+ if (opts->zone_capacity)
+ zlo->zone_capacity = opts->zone_capacity;
+ else
+ zlo->zone_capacity = zlo->zone_size;
+ zlo->nr_zones = nr_zones;
+ zlo->nr_conv_zones = opts->nr_conv_zones;
+ zlo->buffered_io = opts->buffered_io;
+
+ zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
+ opts->nr_queues * opts->queue_depth, zlo->id);
+ if (!zlo->workqueue) {
+ ret = -ENOMEM;
+ goto out_free_idr;
+ }
+
+ if (opts->base_dir)
+ zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
+ else
+ zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
+ if (!zlo->base_dir) {
+ ret = -ENOMEM;
+ goto out_destroy_workqueue;
+ }
+
+ zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
+ zlo->base_dir, zlo->id);
+ if (IS_ERR(zlo->data_dir)) {
+ ret = PTR_ERR(zlo->data_dir);
+ pr_warn("Failed to open directory %s/%u (err=%d)\n",
+ zlo->base_dir, zlo->id, ret);
+ goto out_free_base_dir;
+ }
+
+ /*
+ * If we already have zone files, we are restoring a device created by a
+ * previous add operation. In this case, zloop_init_zone() will check
+ * that the zone files are consistent with the zone configuration given.
+ */
+ restore = zloop_dev_exists(zlo);
+ for (i = 0; i < nr_zones; i++) {
+ ret = zloop_init_zone(zlo, opts, i, restore);
+ if (ret)
+ goto out_close_files;
+ }
+
+ lim.physical_block_size = zlo->block_size;
+ lim.logical_block_size = zlo->block_size;
+
+ zlo->tag_set.ops = &zloop_mq_ops;
+ zlo->tag_set.nr_hw_queues = opts->nr_queues;
+ zlo->tag_set.queue_depth = opts->queue_depth;
+ zlo->tag_set.numa_node = NUMA_NO_NODE;
+ zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
+ zlo->tag_set.driver_data = zlo;
+
+ ret = blk_mq_alloc_tag_set(&zlo->tag_set);
+ if (ret) {
+ pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
+ goto out_close_files;
+ }
+
+ zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
+ if (IS_ERR(zlo->disk)) {
+ pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
+ ret = PTR_ERR(zlo->disk);
+ goto out_cleanup_tags;
+ }
+ zlo->disk->flags = GENHD_FL_NO_PART;
+ zlo->disk->fops = &zloop_fops;
+ zlo->disk->private_data = zlo;
+ sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
+ set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
+
+ ret = blk_revalidate_disk_zones(zlo->disk);
+ if (ret)
+ goto out_cleanup_disk;
+
+ ret = add_disk(zlo->disk);
+ if (ret) {
+ pr_err("add_disk failed (err=%d)\n", ret);
+ goto out_cleanup_disk;
+ }
+
+ mutex_lock(&zloop_ctl_mutex);
+ zlo->state = Zlo_live;
+ mutex_unlock(&zloop_ctl_mutex);
+
+ pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+ zlo->id, zlo->nr_zones,
+ ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
+ zlo->block_size);
+
+ return 0;
+
+out_cleanup_disk:
+ put_disk(zlo->disk);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&zlo->tag_set);
+out_close_files:
+ for (j = 0; j < i; j++) {
+ struct zloop_zone *zone = &zlo->zones[j];
+
+ if (!IS_ERR_OR_NULL(zone->file))
+ fput(zone->file);
+ }
+ fput(zlo->data_dir);
+out_free_base_dir:
+ kfree(zlo->base_dir);
+out_destroy_workqueue:
+ destroy_workqueue(zlo->workqueue);
+out_free_idr:
+ mutex_lock(&zloop_ctl_mutex);
+ idr_remove(&zloop_index_idr, zlo->id);
+ mutex_unlock(&zloop_ctl_mutex);
+out_free_dev:
+ kvfree(zlo);
+out:
+ module_put(THIS_MODULE);
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ return ret;
+}
+
+static int zloop_ctl_remove(struct zloop_options *opts)
+{
+ struct zloop_device *zlo;
+ int ret;
+
+ if (!(opts->mask & ZLOOP_OPT_ID)) {
+ pr_err("No ID specified\n");
+ return -EINVAL;
+ }
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ zlo = idr_find(&zloop_index_idr, opts->id);
+ if (!zlo || zlo->state == Zlo_creating) {
+ ret = -ENODEV;
+ } else if (zlo->state == Zlo_deleting) {
+ ret = -EINVAL;
+ } else {
+ idr_remove(&zloop_index_idr, zlo->id);
+ zlo->state = Zlo_deleting;
+ }
+
+ mutex_unlock(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ del_gendisk(zlo->disk);
+ put_disk(zlo->disk);
+
+ pr_info("Removed device %d\n", opts->id);
+
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+static int zloop_parse_options(struct zloop_options *opts, const char *buf)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *options, *o, *p;
+ unsigned int token;
+ int ret = 0;
+
+ /* Set defaults. */
+ opts->mask = 0;
+ opts->id = ZLOOP_DEF_ID;
+ opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
+ opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
+ opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
+ opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
+ opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
+ opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+
+ if (!buf)
+ return 0;
+
+ /* Skip leading spaces before the options. */
+ while (isspace(*buf))
+ buf++;
+
+ options = o = kstrdup(buf, GFP_KERNEL);
+ if (!options)
+ return -ENOMEM;
+
+ /* Parse the options, doing only some light invalid value checks. */
+ while ((p = strsep(&o, ",\n")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, zloop_opt_tokens, args);
+ opts->mask |= token;
+ switch (token) {
+ case ZLOOP_OPT_ID:
+ if (match_int(args, &opts->id)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case ZLOOP_OPT_CAPACITY:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid capacity\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->capacity =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_ZONE_SIZE:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
+ !is_power_of_2(token)) {
+ pr_err("Invalid zone size %u\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->zone_size =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_ZONE_CAPACITY:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid zone capacity\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->zone_capacity =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_NR_CONV_ZONES:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_conv_zones = token;
+ break;
+ case ZLOOP_OPT_BASE_DIR:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ kfree(opts->base_dir);
+ opts->base_dir = p;
+ break;
+ case ZLOOP_OPT_NR_QUEUES:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid number of queues\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_queues = min(token, num_online_cpus());
+ break;
+ case ZLOOP_OPT_QUEUE_DEPTH:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid queue depth\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->queue_depth = token;
+ break;
+ case ZLOOP_OPT_BUFFERED_IO:
+ opts->buffered_io = true;
+ break;
+ case ZLOOP_OPT_ERR:
+ default:
+ pr_warn("unknown parameter or missing value '%s'\n", p);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ ret = -EINVAL;
+ if (opts->capacity <= opts->zone_size) {
+ pr_err("Invalid capacity\n");
+ goto out;
+ }
+
+ if (opts->zone_capacity > opts->zone_size) {
+ pr_err("Invalid zone capacity\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ kfree(options);
+ return ret;
+}
+
+enum {
+ ZLOOP_CTL_ADD,
+ ZLOOP_CTL_REMOVE,
+};
+
+static struct zloop_ctl_op {
+ int code;
+ const char *name;
+} zloop_ctl_ops[] = {
+ { ZLOOP_CTL_ADD, "add" },
+ { ZLOOP_CTL_REMOVE, "remove" },
+ { -1, NULL },
+};
+
+static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *pos)
+{
+ struct zloop_options opts = { };
+ struct zloop_ctl_op *op;
+ const char *buf, *opts_buf;
+ int i, ret;
+
+ if (count > PAGE_SIZE)
+ return -ENOMEM;
+
+ buf = memdup_user_nul(ubuf, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
+ op = &zloop_ctl_ops[i];
+ if (!op->name) {
+ pr_err("Invalid operation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!strncmp(buf, op->name, strlen(op->name)))
+ break;
+ }
+
+ if (count <= strlen(op->name))
+ opts_buf = NULL;
+ else
+ opts_buf = buf + strlen(op->name);
+
+ ret = zloop_parse_options(&opts, opts_buf);
+ if (ret) {
+ pr_err("Failed to parse options\n");
+ goto out;
+ }
+
+ switch (op->code) {
+ case ZLOOP_CTL_ADD:
+ ret = zloop_ctl_add(&opts);
+ break;
+ case ZLOOP_CTL_REMOVE:
+ ret = zloop_ctl_remove(&opts);
+ break;
+ default:
+ pr_err("Invalid operation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+out:
+ kfree(opts.base_dir);
+ kfree(buf);
+ return ret ? ret : count;
+}
+
+static int zloop_ctl_show(struct seq_file *seq_file, void *private)
+{
+ const struct match_token *tok;
+ int i;
+
+ /* Add operation */
+ seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
+ for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
+ tok = &zloop_opt_tokens[i];
+ if (!tok->pattern)
+ break;
+ if (i)
+ seq_putc(seq_file, ',');
+ seq_puts(seq_file, tok->pattern);
+ }
+ seq_putc(seq_file, '\n');
+
+ /* Remove operation */
+ seq_puts(seq_file, zloop_ctl_ops[1].name);
+ seq_puts(seq_file, " id=%d\n");
+
+ return 0;
+}
+
+static int zloop_ctl_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return single_open(file, zloop_ctl_show, NULL);
+}
+
+static int zloop_ctl_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+
+static const struct file_operations zloop_ctl_fops = {
+ .owner = THIS_MODULE,
+ .open = zloop_ctl_open,
+ .release = zloop_ctl_release,
+ .write = zloop_ctl_write,
+ .read = seq_read,
+};
+
+static struct miscdevice zloop_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "zloop-control",
+ .fops = &zloop_ctl_fops,
+};
+
+static int __init zloop_init(void)
+{
+ int ret;
+
+ ret = misc_register(&zloop_misc);
+ if (ret) {
+ pr_err("Failed to register misc device: %d\n", ret);
+ return ret;
+ }
+ pr_info("Module loaded\n");
+
+ return 0;
+}
+
+static void __exit zloop_exit(void)
+{
+ misc_deregister(&zloop_misc);
+ idr_destroy(&zloop_index_idr);
+}
+
+module_init(zloop_init);
+module_exit(zloop_exit);
+
+MODULE_DESCRIPTION("Zoned loopback device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/zram/backend_deflate.c b/drivers/block/zram/backend_deflate.c
index 0f7f252c12f4..b75016e0e654 100644
--- a/drivers/block/zram/backend_deflate.c
+++ b/drivers/block/zram/backend_deflate.c
@@ -8,7 +8,7 @@
#include "backend_deflate.h"
/* Use the same value as crypto API */
-#define DEFLATE_DEF_WINBITS 11
+#define DEFLATE_DEF_WINBITS (-11)
#define DEFLATE_DEF_MEMLEVEL MAX_MEM_LEVEL
struct deflate_ctx {
@@ -22,8 +22,10 @@ static void deflate_release_params(struct zcomp_params *params)
static int deflate_setup_params(struct zcomp_params *params)
{
- if (params->level == ZCOMP_PARAM_NO_LEVEL)
+ if (params->level == ZCOMP_PARAM_NOT_SET)
params->level = Z_DEFAULT_COMPRESSION;
+ if (params->deflate.winbits == ZCOMP_PARAM_NOT_SET)
+ params->deflate.winbits = DEFLATE_DEF_WINBITS;
return 0;
}
@@ -57,13 +59,13 @@ static int deflate_create(struct zcomp_params *params, struct zcomp_ctx *ctx)
return -ENOMEM;
ctx->context = zctx;
- sz = zlib_deflate_workspacesize(-DEFLATE_DEF_WINBITS, MAX_MEM_LEVEL);
+ sz = zlib_deflate_workspacesize(params->deflate.winbits, MAX_MEM_LEVEL);
zctx->cctx.workspace = vzalloc(sz);
if (!zctx->cctx.workspace)
goto error;
ret = zlib_deflateInit2(&zctx->cctx, params->level, Z_DEFLATED,
- -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL,
+ params->deflate.winbits, DEFLATE_DEF_MEMLEVEL,
Z_DEFAULT_STRATEGY);
if (ret != Z_OK)
goto error;
@@ -73,7 +75,7 @@ static int deflate_create(struct zcomp_params *params, struct zcomp_ctx *ctx)
if (!zctx->dctx.workspace)
goto error;
- ret = zlib_inflateInit2(&zctx->dctx, -DEFLATE_DEF_WINBITS);
+ ret = zlib_inflateInit2(&zctx->dctx, params->deflate.winbits);
if (ret != Z_OK)
goto error;
diff --git a/drivers/block/zram/backend_lz4.c b/drivers/block/zram/backend_lz4.c
index 847f3334eb38..daccd60857eb 100644
--- a/drivers/block/zram/backend_lz4.c
+++ b/drivers/block/zram/backend_lz4.c
@@ -18,7 +18,7 @@ static void lz4_release_params(struct zcomp_params *params)
static int lz4_setup_params(struct zcomp_params *params)
{
- if (params->level == ZCOMP_PARAM_NO_LEVEL)
+ if (params->level == ZCOMP_PARAM_NOT_SET)
params->level = LZ4_ACCELERATION_DEFAULT;
return 0;
diff --git a/drivers/block/zram/backend_lz4hc.c b/drivers/block/zram/backend_lz4hc.c
index 5f37d5abcaeb..9e8a35dfa56d 100644
--- a/drivers/block/zram/backend_lz4hc.c
+++ b/drivers/block/zram/backend_lz4hc.c
@@ -18,7 +18,7 @@ static void lz4hc_release_params(struct zcomp_params *params)
static int lz4hc_setup_params(struct zcomp_params *params)
{
- if (params->level == ZCOMP_PARAM_NO_LEVEL)
+ if (params->level == ZCOMP_PARAM_NOT_SET)
params->level = LZ4HC_DEFAULT_CLEVEL;
return 0;
diff --git a/drivers/block/zram/backend_zstd.c b/drivers/block/zram/backend_zstd.c
index 22c8067536f3..81defb98ed09 100644
--- a/drivers/block/zram/backend_zstd.c
+++ b/drivers/block/zram/backend_zstd.c
@@ -58,7 +58,7 @@ static int zstd_setup_params(struct zcomp_params *params)
return -ENOMEM;
params->drv_data = zp;
- if (params->level == ZCOMP_PARAM_NO_LEVEL)
+ if (params->level == ZCOMP_PARAM_NOT_SET)
params->level = zstd_default_clevel();
zp->cprm = zstd_get_params(params->level, PAGE_SIZE);
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index d26a58c67e95..b1bd1daa0060 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/cpuhotplug.h>
#include <linux/vmalloc.h>
+#include <linux/sysfs.h>
#include "zcomp.h"
@@ -89,23 +90,21 @@ bool zcomp_available_algorithm(const char *comp)
}
/* show available compressors */
-ssize_t zcomp_available_show(const char *comp, char *buf)
+ssize_t zcomp_available_show(const char *comp, char *buf, ssize_t at)
{
- ssize_t sz = 0;
int i;
for (i = 0; i < ARRAY_SIZE(backends) - 1; i++) {
if (!strcmp(comp, backends[i]->name)) {
- sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
- "[%s] ", backends[i]->name);
+ at += sysfs_emit_at(buf, at, "[%s] ",
+ backends[i]->name);
} else {
- sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
- "%s ", backends[i]->name);
+ at += sysfs_emit_at(buf, at, "%s ", backends[i]->name);
}
}
- sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
- return sz;
+ at += sysfs_emit_at(buf, at, "\n");
+ return at;
}
struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index 25339ed1e07e..eacfd3f7d61d 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -5,7 +5,11 @@
#include <linux/mutex.h>
-#define ZCOMP_PARAM_NO_LEVEL INT_MIN
+#define ZCOMP_PARAM_NOT_SET INT_MIN
+
+struct deflate_params {
+ s32 winbits;
+};
/*
* Immutable driver (backend) parameters. The driver may attach private
@@ -17,6 +21,9 @@ struct zcomp_params {
void *dict;
size_t dict_sz;
s32 level;
+ union {
+ struct deflate_params deflate;
+ };
void *drv_data;
};
@@ -72,7 +79,7 @@ struct zcomp {
int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node);
int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node);
-ssize_t zcomp_available_show(const char *comp, char *buf);
+ssize_t zcomp_available_show(const char *comp, char *buf, ssize_t at);
bool zcomp_available_algorithm(const char *comp);
struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index fda7d8624889..8acad3cc6e6e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -373,7 +373,7 @@ static ssize_t initstate_show(struct device *dev,
val = init_done(zram);
up_read(&zram->init_lock);
- return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+ return sysfs_emit(buf, "%u\n", val);
}
static ssize_t disksize_show(struct device *dev,
@@ -381,7 +381,7 @@ static ssize_t disksize_show(struct device *dev,
{
struct zram *zram = dev_to_zram(dev);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
+ return sysfs_emit(buf, "%llu\n", zram->disksize);
}
static ssize_t mem_limit_store(struct device *dev,
@@ -532,7 +532,7 @@ static ssize_t writeback_limit_enable_show(struct device *dev,
spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static ssize_t writeback_limit_store(struct device *dev,
@@ -567,7 +567,7 @@ static ssize_t writeback_limit_show(struct device *dev,
spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
static void reset_bdev(struct zram *zram)
@@ -734,114 +734,19 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
submit_bio(bio);
}
-#define PAGE_WB_SIG "page_index="
-
-#define PAGE_WRITEBACK 0
-#define HUGE_WRITEBACK (1<<0)
-#define IDLE_WRITEBACK (1<<1)
-#define INCOMPRESSIBLE_WRITEBACK (1<<2)
-
-static int scan_slots_for_writeback(struct zram *zram, u32 mode,
- unsigned long nr_pages,
- unsigned long index,
- struct zram_pp_ctl *ctl)
-{
- for (; nr_pages != 0; index++, nr_pages--) {
- bool ok = true;
-
- zram_slot_lock(zram, index);
- if (!zram_allocated(zram, index))
- goto next;
-
- if (zram_test_flag(zram, index, ZRAM_WB) ||
- zram_test_flag(zram, index, ZRAM_SAME))
- goto next;
-
- if (mode & IDLE_WRITEBACK &&
- !zram_test_flag(zram, index, ZRAM_IDLE))
- goto next;
- if (mode & HUGE_WRITEBACK &&
- !zram_test_flag(zram, index, ZRAM_HUGE))
- goto next;
- if (mode & INCOMPRESSIBLE_WRITEBACK &&
- !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
- goto next;
-
- ok = place_pp_slot(zram, ctl, index);
-next:
- zram_slot_unlock(zram, index);
- if (!ok)
- break;
- }
-
- return 0;
-}
-
-static ssize_t writeback_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
{
- struct zram *zram = dev_to_zram(dev);
- unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
- struct zram_pp_ctl *ctl = NULL;
+ unsigned long blk_idx = 0;
+ struct page *page = NULL;
struct zram_pp_slot *pps;
- unsigned long index = 0;
- struct bio bio;
struct bio_vec bio_vec;
- struct page *page = NULL;
- ssize_t ret = len;
- int mode, err;
- unsigned long blk_idx = 0;
-
- if (sysfs_streq(buf, "idle"))
- mode = IDLE_WRITEBACK;
- else if (sysfs_streq(buf, "huge"))
- mode = HUGE_WRITEBACK;
- else if (sysfs_streq(buf, "huge_idle"))
- mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
- else if (sysfs_streq(buf, "incompressible"))
- mode = INCOMPRESSIBLE_WRITEBACK;
- else {
- if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
- return -EINVAL;
-
- if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) ||
- index >= nr_pages)
- return -EINVAL;
-
- nr_pages = 1;
- mode = PAGE_WRITEBACK;
- }
-
- down_read(&zram->init_lock);
- if (!init_done(zram)) {
- ret = -EINVAL;
- goto release_init_lock;
- }
-
- /* Do not permit concurrent post-processing actions. */
- if (atomic_xchg(&zram->pp_in_progress, 1)) {
- up_read(&zram->init_lock);
- return -EAGAIN;
- }
-
- if (!zram->backing_dev) {
- ret = -ENODEV;
- goto release_init_lock;
- }
+ struct bio bio;
+ int ret = 0, err;
+ u32 index;
page = alloc_page(GFP_KERNEL);
- if (!page) {
- ret = -ENOMEM;
- goto release_init_lock;
- }
-
- ctl = init_pp_ctl();
- if (!ctl) {
- ret = -ENOMEM;
- goto release_init_lock;
- }
-
- scan_slots_for_writeback(zram, mode, nr_pages, index, ctl);
+ if (!page)
+ return -ENOMEM;
while ((pps = select_pp_slot(ctl))) {
spin_lock(&zram->wb_limit_lock);
@@ -929,10 +834,215 @@ next:
if (blk_idx)
free_block_bdev(zram, blk_idx);
-
-release_init_lock:
if (page)
__free_page(page);
+
+ return ret;
+}
+
+#define PAGE_WRITEBACK 0
+#define HUGE_WRITEBACK (1 << 0)
+#define IDLE_WRITEBACK (1 << 1)
+#define INCOMPRESSIBLE_WRITEBACK (1 << 2)
+
+static int parse_page_index(char *val, unsigned long nr_pages,
+ unsigned long *lo, unsigned long *hi)
+{
+ int ret;
+
+ ret = kstrtoul(val, 10, lo);
+ if (ret)
+ return ret;
+ if (*lo >= nr_pages)
+ return -ERANGE;
+ *hi = *lo + 1;
+ return 0;
+}
+
+static int parse_page_indexes(char *val, unsigned long nr_pages,
+ unsigned long *lo, unsigned long *hi)
+{
+ char *delim;
+ int ret;
+
+ delim = strchr(val, '-');
+ if (!delim)
+ return -EINVAL;
+
+ *delim = 0x00;
+ ret = kstrtoul(val, 10, lo);
+ if (ret)
+ return ret;
+ if (*lo >= nr_pages)
+ return -ERANGE;
+
+ ret = kstrtoul(delim + 1, 10, hi);
+ if (ret)
+ return ret;
+ if (*hi >= nr_pages || *lo > *hi)
+ return -ERANGE;
+ *hi += 1;
+ return 0;
+}
+
+static int parse_mode(char *val, u32 *mode)
+{
+ *mode = 0;
+
+ if (!strcmp(val, "idle"))
+ *mode = IDLE_WRITEBACK;
+ if (!strcmp(val, "huge"))
+ *mode = HUGE_WRITEBACK;
+ if (!strcmp(val, "huge_idle"))
+ *mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
+ if (!strcmp(val, "incompressible"))
+ *mode = INCOMPRESSIBLE_WRITEBACK;
+
+ if (*mode == 0)
+ return -EINVAL;
+ return 0;
+}
+
+static int scan_slots_for_writeback(struct zram *zram, u32 mode,
+ unsigned long lo, unsigned long hi,
+ struct zram_pp_ctl *ctl)
+{
+ u32 index = lo;
+
+ while (index < hi) {
+ bool ok = true;
+
+ zram_slot_lock(zram, index);
+ if (!zram_allocated(zram, index))
+ goto next;
+
+ if (zram_test_flag(zram, index, ZRAM_WB) ||
+ zram_test_flag(zram, index, ZRAM_SAME))
+ goto next;
+
+ if (mode & IDLE_WRITEBACK &&
+ !zram_test_flag(zram, index, ZRAM_IDLE))
+ goto next;
+ if (mode & HUGE_WRITEBACK &&
+ !zram_test_flag(zram, index, ZRAM_HUGE))
+ goto next;
+ if (mode & INCOMPRESSIBLE_WRITEBACK &&
+ !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
+ goto next;
+
+ ok = place_pp_slot(zram, ctl, index);
+next:
+ zram_slot_unlock(zram, index);
+ if (!ok)
+ break;
+ index++;
+ }
+
+ return 0;
+}
+
+static ssize_t writeback_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct zram *zram = dev_to_zram(dev);
+ u64 nr_pages = zram->disksize >> PAGE_SHIFT;
+ unsigned long lo = 0, hi = nr_pages;
+ struct zram_pp_ctl *ctl = NULL;
+ char *args, *param, *val;
+ ssize_t ret = len;
+ int err, mode = 0;
+
+ down_read(&zram->init_lock);
+ if (!init_done(zram)) {
+ up_read(&zram->init_lock);
+ return -EINVAL;
+ }
+
+ /* Do not permit concurrent post-processing actions. */
+ if (atomic_xchg(&zram->pp_in_progress, 1)) {
+ up_read(&zram->init_lock);
+ return -EAGAIN;
+ }
+
+ if (!zram->backing_dev) {
+ ret = -ENODEV;
+ goto release_init_lock;
+ }
+
+ ctl = init_pp_ctl();
+ if (!ctl) {
+ ret = -ENOMEM;
+ goto release_init_lock;
+ }
+
+ args = skip_spaces(buf);
+ while (*args) {
+ args = next_arg(args, &param, &val);
+
+ /*
+ * Workaround to support the old writeback interface.
+ *
+ * The old writeback interface has a minor inconsistency and
+ * requires key=value only for page_index parameter, while the
+ * writeback mode is a valueless parameter.
+ *
+ * This is not the case anymore and now all parameters are
+ * required to have values, however, we need to support the
+ * legacy writeback interface format so we check if we can
+ * recognize a valueless parameter as the (legacy) writeback
+ * mode.
+ */
+ if (!val || !*val) {
+ err = parse_mode(param, &mode);
+ if (err) {
+ ret = err;
+ goto release_init_lock;
+ }
+
+ scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ break;
+ }
+
+ if (!strcmp(param, "type")) {
+ err = parse_mode(val, &mode);
+ if (err) {
+ ret = err;
+ goto release_init_lock;
+ }
+
+ scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ break;
+ }
+
+ if (!strcmp(param, "page_index")) {
+ err = parse_page_index(val, nr_pages, &lo, &hi);
+ if (err) {
+ ret = err;
+ goto release_init_lock;
+ }
+
+ scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ continue;
+ }
+
+ if (!strcmp(param, "page_indexes")) {
+ err = parse_page_indexes(val, nr_pages, &lo, &hi);
+ if (err) {
+ ret = err;
+ goto release_init_lock;
+ }
+
+ scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ continue;
+ }
+ }
+
+ err = zram_writeback_slots(zram, ctl);
+ if (err)
+ ret = err;
+
+release_init_lock:
release_pp_ctl(zram, ctl);
atomic_set(&zram->pp_in_progress, 0);
up_read(&zram->init_lock);
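
For illustration (not part of the patch): with this rework, writeback_store accepts key=value parameters in addition to the legacy valueless modes. A minimal userspace sketch; the zram0 node is an example.

/* Illustrative sketch, not from this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/block/zram0/writeback", O_WRONLY);

	if (fd < 0)
		return 1;
	/* New form: write back only pages 100..199 (range is inclusive). */
	dprintf(fd, "page_indexes=100-199");
	/*
	 * The legacy valueless form is still recognized, e.g.
	 *   echo idle > /sys/block/zram0/writeback
	 */
	close(fd);
	return 0;
}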
@@ -1115,12 +1225,13 @@ static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
zram->comp_algs[prio] = alg;
}
-static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, char *buf)
+static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio,
+ char *buf, ssize_t at)
{
ssize_t sz;
down_read(&zram->init_lock);
- sz = zcomp_available_show(zram->comp_algs[prio], buf);
+ sz = zcomp_available_show(zram->comp_algs[prio], buf, at);
up_read(&zram->init_lock);
return sz;
@@ -1166,13 +1277,15 @@ static void comp_params_reset(struct zram *zram, u32 prio)
struct zcomp_params *params = &zram->params[prio];
vfree(params->dict);
- params->level = ZCOMP_PARAM_NO_LEVEL;
+ params->level = ZCOMP_PARAM_NOT_SET;
+ params->deflate.winbits = ZCOMP_PARAM_NOT_SET;
params->dict_sz = 0;
params->dict = NULL;
}
static int comp_params_store(struct zram *zram, u32 prio, s32 level,
- const char *dict_path)
+ const char *dict_path,
+ struct deflate_params *deflate_params)
{
ssize_t sz = 0;
@@ -1190,6 +1303,7 @@ static int comp_params_store(struct zram *zram, u32 prio, s32 level,
zram->params[prio].dict_sz = sz;
zram->params[prio].level = level;
+ zram->params[prio].deflate.winbits = deflate_params->winbits;
return 0;
}
@@ -1198,11 +1312,14 @@ static ssize_t algorithm_params_store(struct device *dev,
const char *buf,
size_t len)
{
- s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NO_LEVEL;
+ s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NOT_SET;
char *args, *param, *val, *algo = NULL, *dict_path = NULL;
+ struct deflate_params deflate_params;
struct zram *zram = dev_to_zram(dev);
int ret;
+ deflate_params.winbits = ZCOMP_PARAM_NOT_SET;
+
args = skip_spaces(buf);
while (*args) {
args = next_arg(args, &param, &val);
@@ -1233,6 +1350,13 @@ static ssize_t algorithm_params_store(struct device *dev,
dict_path = val;
continue;
}
+
+ if (!strcmp(param, "deflate.winbits")) {
+ ret = kstrtoint(val, 10, &deflate_params.winbits);
+ if (ret)
+ return ret;
+ continue;
+ }
}
/* Lookup priority by algorithm name */
@@ -1254,7 +1378,7 @@ static ssize_t algorithm_params_store(struct device *dev,
if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS)
return -EINVAL;
- ret = comp_params_store(zram, prio, level, dict_path);
+ ret = comp_params_store(zram, prio, level, dict_path, &deflate_params);
return ret ? ret : len;
}
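
For illustration (not part of the patch): the new deflate.winbits knob is passed straight to zlib_deflateInit2(), where negative window bits select a raw (headerless) deflate stream, which is why backend_deflate.c now defaults to -11 instead of negating 11 at call sites. A sketch of setting it through the algorithm_params attribute; the zram0 node and the chosen value are examples.

/* Illustrative sketch, not from this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/block/zram0/algorithm_params", O_WRONLY);

	if (fd < 0)
		return 1;
	/* Larger raw-deflate window for the deflate backend. */
	dprintf(fd, "algo=deflate deflate.winbits=-15");
	close(fd);
	return 0;
}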
@@ -1264,7 +1388,7 @@ static ssize_t comp_algorithm_show(struct device *dev,
{
struct zram *zram = dev_to_zram(dev);
- return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf);
+ return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf, 0);
}
static ssize_t comp_algorithm_store(struct device *dev,
@@ -1292,8 +1416,8 @@ static ssize_t recomp_algorithm_show(struct device *dev,
if (!zram->comp_algs[prio])
continue;
- sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "#%d: ", prio);
- sz += __comp_algorithm_show(zram, prio, buf + sz);
+ sz += sysfs_emit_at(buf, sz, "#%d: ", prio);
+ sz += __comp_algorithm_show(zram, prio, buf, sz);
}
return sz;
@@ -1365,7 +1489,7 @@ static ssize_t io_stat_show(struct device *dev,
ssize_t ret;
down_read(&zram->init_lock);
- ret = scnprintf(buf, PAGE_SIZE,
+ ret = sysfs_emit(buf,
"%8llu %8llu 0 %8llu\n",
(u64)atomic64_read(&zram->stats.failed_reads),
(u64)atomic64_read(&zram->stats.failed_writes),
@@ -1395,7 +1519,7 @@ static ssize_t mm_stat_show(struct device *dev,
orig_size = atomic64_read(&zram->stats.pages_stored);
max_used = atomic_long_read(&zram->stats.max_used_pages);
- ret = scnprintf(buf, PAGE_SIZE,
+ ret = sysfs_emit(buf,
"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
@@ -1420,8 +1544,8 @@ static ssize_t bd_stat_show(struct device *dev,
ssize_t ret;
down_read(&zram->init_lock);
- ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu\n",
+ ret = sysfs_emit(buf,
+ "%8llu %8llu %8llu\n",
FOUR_K((u64)atomic64_read(&zram->stats.bd_count)),
FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)),
FOUR_K((u64)atomic64_read(&zram->stats.bd_writes)));
@@ -1439,7 +1563,7 @@ static ssize_t debug_stat_show(struct device *dev,
ssize_t ret;
down_read(&zram->init_lock);
- ret = scnprintf(buf, PAGE_SIZE,
+ ret = sysfs_emit(buf,
"version: %d\n0 %8llu\n",
version,
(u64)atomic64_read(&zram->stats.miss_free));
@@ -1694,7 +1818,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page,
*/
handle = zs_malloc(zram->mem_pool, PAGE_SIZE,
GFP_NOIO | __GFP_NOWARN |
- __GFP_HIGHMEM | __GFP_MOVABLE);
+ __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page));
if (IS_ERR_VALUE(handle))
return PTR_ERR((void *)handle);
@@ -1761,7 +1885,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
handle = zs_malloc(zram->mem_pool, comp_len,
GFP_NOIO | __GFP_NOWARN |
- __GFP_HIGHMEM | __GFP_MOVABLE);
+ __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page));
if (IS_ERR_VALUE(handle)) {
zcomp_stream_put(zstrm);
return PTR_ERR((void *)handle);
@@ -1981,10 +2105,15 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page,
* We are holding per-CPU stream mutex and entry lock so better
* avoid direct reclaim. Allocation error is not fatal since
* we still have the old object in the mem_pool.
+ *
+ * XXX: technically, the node we really want here is the node that holds
+ * the original compressed data. But that would require us to modify
+ * zsmalloc API to return this information. For now, we will make do with
+ * the node of the page allocated for recompression.
*/
handle_new = zs_malloc(zram->mem_pool, comp_len_new,
GFP_NOIO | __GFP_NOWARN |
- __GFP_HIGHMEM | __GFP_MOVABLE);
+ __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page));
if (IS_ERR_VALUE(handle_new)) {
zcomp_stream_put(zstrm);
return PTR_ERR((void *)handle_new);
@@ -2682,7 +2811,7 @@ static ssize_t hot_add_show(const struct class *class,
if (ret < 0)
return ret;
- return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return sysfs_emit(buf, "%d\n", ret);
}
/* This attribute must be set to 0400, so CLASS_ATTR_RO() can not be used */
static struct class_attribute class_attr_hot_add =