summaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/Kconfig10
-rw-r--r--drivers/block/Makefile4
-rw-r--r--drivers/block/amiflop.c10
-rw-r--r--drivers/block/aoe/aoeblk.c4
-rw-r--r--drivers/block/aoe/aoecmd.c2
-rw-r--r--drivers/block/aoe/aoemain.c2
-rw-r--r--drivers/block/brd.c75
-rw-r--r--drivers/block/drbd/drbd_int.h39
-rw-r--r--drivers/block/drbd/drbd_main.c59
-rw-r--r--drivers/block/drbd/drbd_nl.c1
-rw-r--r--drivers/block/drbd/drbd_receiver.c264
-rw-r--r--drivers/block/drbd/drbd_worker.c56
-rw-r--r--drivers/block/floppy.c59
-rw-r--r--drivers/block/loop.c45
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c6
-rw-r--r--drivers/block/nbd.c10
-rw-r--r--drivers/block/null_blk/main.c4
-rw-r--r--drivers/block/rbd.c2
-rw-r--r--drivers/block/rnbd/rnbd-clt.c6
-rw-r--r--drivers/block/rnull.rs80
-rw-r--r--drivers/block/rnull/Kconfig13
-rw-r--r--drivers/block/rnull/Makefile3
-rw-r--r--drivers/block/rnull/configfs.rs262
-rw-r--r--drivers/block/rnull/rnull.rs104
-rw-r--r--drivers/block/sunvdc.c7
-rw-r--r--drivers/block/swim.c4
-rw-r--r--drivers/block/ublk_drv.c340
-rw-r--r--drivers/block/virtio_blk.c8
-rw-r--r--drivers/block/xen-blkfront.c4
-rw-r--r--drivers/block/zloop.c3
-rw-r--r--drivers/block/zram/zram_drv.c33
31 files changed, 815 insertions, 704 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index df38fb364904..77d694448990 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -17,6 +17,7 @@ menuconfig BLK_DEV
if BLK_DEV
source "drivers/block/null_blk/Kconfig"
+source "drivers/block/rnull/Kconfig"
config BLK_DEV_FD
tristate "Normal floppy disk support"
@@ -311,15 +312,6 @@ config VIRTIO_BLK
This is the virtual block driver for virtio. It can be used with
QEMU based VMMs (like KVM or Xen). Say Y or M.
-config BLK_DEV_RUST_NULL
- tristate "Rust null block driver (Experimental)"
- depends on RUST
- help
- This is the Rust implementation of the null block driver. For now it
- is only a minimal stub.
-
- If unsure, say N.
-
config BLK_DEV_RBD
tristate "Rados block device (RBD)"
depends on INET && BLOCK
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index a695ce74ef22..2d8096eb8cdf 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -9,9 +9,6 @@
# needed for trace events
ccflags-y += -I$(src)
-obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
-rnull_mod-y := rnull.o
-
obj-$(CONFIG_MAC_FLOPPY) += swim3.o
obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o
obj-$(CONFIG_BLK_DEV_FD) += floppy.o
@@ -38,6 +35,7 @@ obj-$(CONFIG_ZRAM) += zram/
obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
+obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull/
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 6357d86eafdc..2932b6653b6f 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1523,13 +1523,13 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- int drive = MINOR(bdev->bd_dev) & 3;
+ struct amiga_floppy_struct *p = disk->private_data;
- geo->heads = unit[drive].type->heads;
- geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
- geo->cylinders = unit[drive].type->tracks;
+ geo->heads = p->type->heads;
+ geo->sectors = p->dtype->sects * p->type->sect_mult;
+ geo->cylinders = p->type->tracks;
return 0;
}
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 00b74a845328..34ead75e7e02 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -269,9 +269,9 @@ static blk_status_t aoeblk_queue_rq(struct blk_mq_hw_ctx *hctx,
}
static int
-aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+aoeblk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct aoedev *d = bdev->bd_disk->private_data;
+ struct aoedev *d = disk->private_data;
if ((d->flags & DEVFL_UP) == 0) {
printk(KERN_ERR "aoe: disk not up\n");
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 6298f8e271e3..a9affb7c264d 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1761,6 +1761,6 @@ aoecmd_exit(void)
kfree(kts);
kfree(ktiowq);
- free_page((unsigned long) page_address(empty_page));
+ __free_page(empty_page);
empty_page = NULL;
}
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index cdf6e4041bb9..3b21750038ee 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -44,7 +44,7 @@ aoe_init(void)
{
int ret;
- aoe_wq = alloc_workqueue("aoe_wq", 0, 0);
+ aoe_wq = alloc_workqueue("aoe_wq", WQ_PERCPU, 0);
if (!aoe_wq)
return -ENOMEM;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 0c2eabe14af3..9778259b30d4 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -44,45 +44,74 @@ struct brd_device {
};
/*
- * Look up and return a brd's page for a given sector.
+ * Look up and return a brd's page with reference grabbed for a given sector.
*/
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
- return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+ struct page *page;
+ XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+
+ rcu_read_lock();
+repeat:
+ page = xas_load(&xas);
+ if (xas_retry(&xas, page)) {
+ xas_reset(&xas);
+ goto repeat;
+ }
+
+ if (!page)
+ goto out;
+
+ if (!get_page_unless_zero(page)) {
+ xas_reset(&xas);
+ goto repeat;
+ }
+
+ if (unlikely(page != xas_reload(&xas))) {
+ put_page(page);
+ xas_reset(&xas);
+ goto repeat;
+ }
+out:
+ rcu_read_unlock();
+
+ return page;
}
/*
* Insert a new page for a given sector, if one does not already exist.
+ * The returned page will grab reference.
*/
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
blk_opf_t opf)
- __releases(rcu)
- __acquires(rcu)
{
gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
struct page *page, *ret;
- rcu_read_unlock();
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
- if (!page) {
- rcu_read_lock();
+ if (!page)
return ERR_PTR(-ENOMEM);
- }
xa_lock(&brd->brd_pages);
ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
page, gfp);
- rcu_read_lock();
- if (ret) {
+ if (!ret) {
+ brd->brd_nr_pages++;
+ get_page(page);
+ xa_unlock(&brd->brd_pages);
+ return page;
+ }
+
+ if (!xa_is_err(ret)) {
+ get_page(ret);
xa_unlock(&brd->brd_pages);
- __free_page(page);
- if (xa_is_err(ret))
- return ERR_PTR(xa_err(ret));
+ put_page(page);
return ret;
}
- brd->brd_nr_pages++;
+
xa_unlock(&brd->brd_pages);
- return page;
+ put_page(page);
+ return ERR_PTR(xa_err(ret));
}
/*
@@ -95,7 +124,7 @@ static void brd_free_pages(struct brd_device *brd)
pgoff_t idx;
xa_for_each(&brd->brd_pages, idx, page) {
- __free_page(page);
+ put_page(page);
cond_resched();
}
@@ -117,7 +146,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
- rcu_read_lock();
page = brd_lookup_page(brd, sector);
if (!page && op_is_write(opf)) {
page = brd_insert_page(brd, sector, opf);
@@ -135,13 +163,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
memset(kaddr, 0, bv.bv_len);
}
kunmap_local(kaddr);
- rcu_read_unlock();
bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+ if (page)
+ put_page(page);
return true;
out_error:
- rcu_read_unlock();
if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
bio_wouldblock_error(bio);
else
@@ -149,13 +177,6 @@ out_error:
return false;
}
-static void brd_free_one_page(struct rcu_head *head)
-{
- struct page *page = container_of(head, struct page, rcu_head);
-
- __free_page(page);
-}
-
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
@@ -170,7 +191,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
if (page) {
- call_rcu(&page->rcu_head, brd_free_one_page);
+ put_page(page);
brd->brd_nr_pages--;
}
aligned_sector += PAGE_SECTORS;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e21492981f7d..f6d6276974ee 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -380,6 +380,9 @@ enum {
/* this is/was a write request */
__EE_WRITE,
+ /* hand back using mempool_free(e, drbd_buffer_page_pool) */
+ __EE_RELEASE_TO_MEMPOOL,
+
/* this is/was a write same request */
__EE_WRITE_SAME,
@@ -402,6 +405,7 @@ enum {
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
#define EE_SUBMITTED (1<<__EE_SUBMITTED)
#define EE_WRITE (1<<__EE_WRITE)
+#define EE_RELEASE_TO_MEMPOOL (1<<__EE_RELEASE_TO_MEMPOOL)
#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
#define EE_APPLICATION (1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
@@ -858,7 +862,6 @@ struct drbd_device {
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* need to send P_WRITE_ACK */
struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
- struct list_head net_ee; /* zero-copy network send in progress */
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
@@ -1329,24 +1332,6 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t drbd_request_mempool;
extern mempool_t drbd_ee_mempool;
-/* drbd's page pool, used to buffer data received from the peer,
- * or data requested by the peer.
- *
- * This does not have an emergency reserve.
- *
- * When allocating from this pool, it first takes pages from the pool.
- * Only if the pool is depleted will try to allocate from the system.
- *
- * The assumption is that pages taken from this pool will be processed,
- * and given back, "quickly", and then can be recycled, so we can avoid
- * frequent calls to alloc_page(), and still will be able to make progress even
- * under memory pressure.
- */
-extern struct page *drbd_pp_pool;
-extern spinlock_t drbd_pp_lock;
-extern int drbd_pp_vacant;
-extern wait_queue_head_t drbd_pp_wait;
-
/* We also need a standard (emergency-reserve backed) page pool
* for meta data IO (activity log, bitmap).
* We can keep it global, as long as it is used as "N pages at a time".
@@ -1354,6 +1339,7 @@ extern wait_queue_head_t drbd_pp_wait;
*/
#define DRBD_MIN_POOL_PAGES 128
extern mempool_t drbd_md_io_page_pool;
+extern mempool_t drbd_buffer_page_pool;
/* We also need to make sure we get a bio
* when we need it for housekeeping purposes */
@@ -1488,10 +1474,7 @@ extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *,
sector_t, unsigned int,
unsigned int,
gfp_t) __must_hold(local);
-extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
- int);
-#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
-#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *req);
extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
extern int drbd_connected(struct drbd_peer_device *);
@@ -1610,16 +1593,6 @@ static inline struct page *page_chain_next(struct page *page)
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
-{
- struct page *page = peer_req->pages;
- page_chain_for_each(page) {
- if (page_count(page) > 1)
- return 1;
- }
- return 0;
-}
-
static inline union drbd_state drbd_read_state(struct drbd_device *device)
{
struct drbd_resource *resource = device->resource;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 52724b79be30..c73376886e7a 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -114,20 +114,10 @@ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t drbd_request_mempool;
mempool_t drbd_ee_mempool;
mempool_t drbd_md_io_page_pool;
+mempool_t drbd_buffer_page_pool;
struct bio_set drbd_md_io_bio_set;
struct bio_set drbd_io_bio_set;
-/* I do not use a standard mempool, because:
- 1) I want to hand out the pre-allocated objects first.
- 2) I want to be able to interrupt sleeping allocation with a signal.
- Note: This is a single linked list, the next pointer is the private
- member of struct page.
- */
-struct page *drbd_pp_pool;
-DEFINE_SPINLOCK(drbd_pp_lock);
-int drbd_pp_vacant;
-wait_queue_head_t drbd_pp_wait;
-
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
static const struct block_device_operations drbd_ops = {
@@ -1611,6 +1601,7 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
struct drbd_peer_request *peer_req)
{
+ bool use_sendpage = !(peer_req->flags & EE_RELEASE_TO_MEMPOOL);
struct page *page = peer_req->pages;
unsigned len = peer_req->i.size;
int err;
@@ -1619,8 +1610,13 @@ static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
page_chain_for_each(page) {
unsigned l = min_t(unsigned, len, PAGE_SIZE);
- err = _drbd_send_page(peer_device, page, 0, l,
- page_chain_next(page) ? MSG_MORE : 0);
+ if (likely(use_sendpage))
+ err = _drbd_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+ else
+ err = _drbd_no_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+
if (err)
return err;
len -= l;
@@ -1962,7 +1958,6 @@ void drbd_init_set_defaults(struct drbd_device *device)
INIT_LIST_HEAD(&device->sync_ee);
INIT_LIST_HEAD(&device->done_ee);
INIT_LIST_HEAD(&device->read_ee);
- INIT_LIST_HEAD(&device->net_ee);
INIT_LIST_HEAD(&device->resync_reads);
INIT_LIST_HEAD(&device->resync_work.list);
INIT_LIST_HEAD(&device->unplug_work.list);
@@ -2043,7 +2038,6 @@ void drbd_device_cleanup(struct drbd_device *device)
D_ASSERT(device, list_empty(&device->sync_ee));
D_ASSERT(device, list_empty(&device->done_ee));
D_ASSERT(device, list_empty(&device->read_ee));
- D_ASSERT(device, list_empty(&device->net_ee));
D_ASSERT(device, list_empty(&device->resync_reads));
D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
D_ASSERT(device, list_empty(&device->resync_work.list));
@@ -2055,19 +2049,11 @@ void drbd_device_cleanup(struct drbd_device *device)
static void drbd_destroy_mempools(void)
{
- struct page *page;
-
- while (drbd_pp_pool) {
- page = drbd_pp_pool;
- drbd_pp_pool = (struct page *)page_private(page);
- __free_page(page);
- drbd_pp_vacant--;
- }
-
/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
bioset_exit(&drbd_io_bio_set);
bioset_exit(&drbd_md_io_bio_set);
+ mempool_exit(&drbd_buffer_page_pool);
mempool_exit(&drbd_md_io_page_pool);
mempool_exit(&drbd_ee_mempool);
mempool_exit(&drbd_request_mempool);
@@ -2086,9 +2072,8 @@ static void drbd_destroy_mempools(void)
static int drbd_create_mempools(void)
{
- struct page *page;
const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
- int i, ret;
+ int ret;
/* caches */
drbd_request_cache = kmem_cache_create(
@@ -2125,6 +2110,10 @@ static int drbd_create_mempools(void)
if (ret)
goto Enomem;
+ ret = mempool_init_page_pool(&drbd_buffer_page_pool, number, 0);
+ if (ret)
+ goto Enomem;
+
ret = mempool_init_slab_pool(&drbd_request_mempool, number,
drbd_request_cache);
if (ret)
@@ -2134,15 +2123,6 @@ static int drbd_create_mempools(void)
if (ret)
goto Enomem;
- for (i = 0; i < number; i++) {
- page = alloc_page(GFP_HIGHUSER);
- if (!page)
- goto Enomem;
- set_page_private(page, (unsigned long)drbd_pp_pool);
- drbd_pp_pool = page;
- }
- drbd_pp_vacant = number;
-
return 0;
Enomem:
@@ -2169,10 +2149,6 @@ static void drbd_release_all_peer_reqs(struct drbd_device *device)
rr = drbd_free_peer_reqs(device, &device->done_ee);
if (rr)
drbd_err(device, "%d EEs in done list found!\n", rr);
-
- rr = drbd_free_peer_reqs(device, &device->net_ee);
- if (rr)
- drbd_err(device, "%d EEs in net list found!\n", rr);
}
/* caution. no locking. */
@@ -2863,11 +2839,6 @@ static int __init drbd_init(void)
return err;
}
- /*
- * allocate all necessary structs
- */
- init_waitqueue_head(&drbd_pp_wait);
-
drbd_proc = NULL; /* play safe for drbd_cleanup */
idr_init(&drbd_devices);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e09930c2b226..91f3b8afb63c 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1330,6 +1330,7 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device,
lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
else
lim.max_write_zeroes_sectors = 0;
+ lim.max_hw_wzeroes_unmap_sectors = 0;
if ((lim.discard_granularity >> SECTOR_SHIFT) >
lim.max_hw_discard_sectors) {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 975024cf03c5..caaf2781136d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -33,6 +33,7 @@
#include <linux/string.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
+#include <linux/mempool.h>
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"
@@ -63,182 +64,31 @@ static int e_end_block(struct drbd_work *, int);
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
-/*
- * some helper functions to deal with single linked page lists,
- * page->private being our "next" pointer.
- */
-
-/* If at least n pages are linked at head, get n pages off.
- * Otherwise, don't modify head, and return NULL.
- * Locking is the responsibility of the caller.
- */
-static struct page *page_chain_del(struct page **head, int n)
-{
- struct page *page;
- struct page *tmp;
-
- BUG_ON(!n);
- BUG_ON(!head);
-
- page = *head;
-
- if (!page)
- return NULL;
-
- while (page) {
- tmp = page_chain_next(page);
- if (--n == 0)
- break; /* found sufficient pages */
- if (tmp == NULL)
- /* insufficient pages, don't use any of them. */
- return NULL;
- page = tmp;
- }
-
- /* add end of list marker for the returned list */
- set_page_private(page, 0);
- /* actual return value, and adjustment of head */
- page = *head;
- *head = tmp;
- return page;
-}
-
-/* may be used outside of locks to find the tail of a (usually short)
- * "private" page chain, before adding it back to a global chain head
- * with page_chain_add() under a spinlock. */
-static struct page *page_chain_tail(struct page *page, int *len)
-{
- struct page *tmp;
- int i = 1;
- while ((tmp = page_chain_next(page))) {
- ++i;
- page = tmp;
- }
- if (len)
- *len = i;
- return page;
-}
-
-static int page_chain_free(struct page *page)
-{
- struct page *tmp;
- int i = 0;
- page_chain_for_each_safe(page, tmp) {
- put_page(page);
- ++i;
- }
- return i;
-}
-
-static void page_chain_add(struct page **head,
- struct page *chain_first, struct page *chain_last)
-{
-#if 1
- struct page *tmp;
- tmp = page_chain_tail(chain_first, NULL);
- BUG_ON(tmp != chain_last);
-#endif
-
- /* add chain to head */
- set_page_private(chain_last, (unsigned long)*head);
- *head = chain_first;
-}
-
-static struct page *__drbd_alloc_pages(struct drbd_device *device,
- unsigned int number)
+static struct page *__drbd_alloc_pages(unsigned int number)
{
struct page *page = NULL;
struct page *tmp = NULL;
unsigned int i = 0;
- /* Yes, testing drbd_pp_vacant outside the lock is racy.
- * So what. It saves a spin_lock. */
- if (drbd_pp_vacant >= number) {
- spin_lock(&drbd_pp_lock);
- page = page_chain_del(&drbd_pp_pool, number);
- if (page)
- drbd_pp_vacant -= number;
- spin_unlock(&drbd_pp_lock);
- if (page)
- return page;
- }
-
/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
for (i = 0; i < number; i++) {
- tmp = alloc_page(GFP_TRY);
+ tmp = mempool_alloc(&drbd_buffer_page_pool, GFP_TRY);
if (!tmp)
- break;
+ goto fail;
set_page_private(tmp, (unsigned long)page);
page = tmp;
}
-
- if (i == number)
- return page;
-
- /* Not enough pages immediately available this time.
- * No need to jump around here, drbd_alloc_pages will retry this
- * function "soon". */
- if (page) {
- tmp = page_chain_tail(page, NULL);
- spin_lock(&drbd_pp_lock);
- page_chain_add(&drbd_pp_pool, page, tmp);
- drbd_pp_vacant += i;
- spin_unlock(&drbd_pp_lock);
+ return page;
+fail:
+ page_chain_for_each_safe(page, tmp) {
+ set_page_private(page, 0);
+ mempool_free(page, &drbd_buffer_page_pool);
}
return NULL;
}
-static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
- struct list_head *to_be_freed)
-{
- struct drbd_peer_request *peer_req, *tmp;
-
- /* The EEs are always appended to the end of the list. Since
- they are sent in order over the wire, they have to finish
- in order. As soon as we see the first not finished we can
- stop to examine the list... */
-
- list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
- if (drbd_peer_req_has_active_page(peer_req))
- break;
- list_move(&peer_req->w.list, to_be_freed);
- }
-}
-
-static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
-{
- LIST_HEAD(reclaimed);
- struct drbd_peer_request *peer_req, *t;
-
- spin_lock_irq(&device->resource->req_lock);
- reclaim_finished_net_peer_reqs(device, &reclaimed);
- spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
- drbd_free_net_peer_req(device, peer_req);
-}
-
-static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
-{
- struct drbd_peer_device *peer_device;
- int vnr;
-
- rcu_read_lock();
- idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
- struct drbd_device *device = peer_device->device;
- if (!atomic_read(&device->pp_in_use_by_net))
- continue;
-
- kref_get(&device->kref);
- rcu_read_unlock();
- drbd_reclaim_net_peer_reqs(device);
- kref_put(&device->kref, drbd_destroy_device);
- rcu_read_lock();
- }
- rcu_read_unlock();
-}
-
/**
* drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
* @peer_device: DRBD device.
@@ -263,9 +113,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
bool retry)
{
struct drbd_device *device = peer_device->device;
- struct page *page = NULL;
+ struct page *page;
struct net_conf *nc;
- DEFINE_WAIT(wait);
unsigned int mxb;
rcu_read_lock();
@@ -273,37 +122,9 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
mxb = nc ? nc->max_buffers : 1000000;
rcu_read_unlock();
- if (atomic_read(&device->pp_in_use) < mxb)
- page = __drbd_alloc_pages(device, number);
-
- /* Try to keep the fast path fast, but occasionally we need
- * to reclaim the pages we lended to the network stack. */
- if (page && atomic_read(&device->pp_in_use_by_net) > 512)
- drbd_reclaim_net_peer_reqs(device);
-
- while (page == NULL) {
- prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
-
- drbd_reclaim_net_peer_reqs(device);
-
- if (atomic_read(&device->pp_in_use) < mxb) {
- page = __drbd_alloc_pages(device, number);
- if (page)
- break;
- }
-
- if (!retry)
- break;
-
- if (signal_pending(current)) {
- drbd_warn(device, "drbd_alloc_pages interrupted!\n");
- break;
- }
-
- if (schedule_timeout(HZ/10) == 0)
- mxb = UINT_MAX;
- }
- finish_wait(&drbd_pp_wait, &wait);
+ if (atomic_read(&device->pp_in_use) >= mxb)
+ schedule_timeout_interruptible(HZ / 10);
+ page = __drbd_alloc_pages(number);
if (page)
atomic_add(number, &device->pp_in_use);
@@ -314,29 +135,25 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
* Is also used from inside an other spin_lock_irq(&resource->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
+static void drbd_free_pages(struct drbd_device *device, struct page *page)
{
- atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
- int i;
+ struct page *tmp;
+ int i = 0;
if (page == NULL)
return;
- if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
- i = page_chain_free(page);
- else {
- struct page *tmp;
- tmp = page_chain_tail(page, &i);
- spin_lock(&drbd_pp_lock);
- page_chain_add(&drbd_pp_pool, page, tmp);
- drbd_pp_vacant += i;
- spin_unlock(&drbd_pp_lock);
- }
- i = atomic_sub_return(i, a);
+ page_chain_for_each_safe(page, tmp) {
+ set_page_private(page, 0);
+ if (page_count(page) == 1)
+ mempool_free(page, &drbd_buffer_page_pool);
+ else
+ put_page(page);
+ i++;
+ }
+ i = atomic_sub_return(i, &device->pp_in_use);
if (i < 0)
- drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
- is_net ? "pp_in_use_by_net" : "pp_in_use", i);
- wake_up(&drbd_pp_wait);
+ drbd_warn(device, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
}
/*
@@ -380,6 +197,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
gfpflags_allow_blocking(gfp_mask));
if (!page)
goto fail;
+ if (!mempool_is_saturated(&drbd_buffer_page_pool))
+ peer_req->flags |= EE_RELEASE_TO_MEMPOOL;
}
memset(peer_req, 0, sizeof(*peer_req));
@@ -403,13 +222,12 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return NULL;
}
-void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
- int is_net)
+void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
might_sleep();
if (peer_req->flags & EE_HAS_DIGEST)
kfree(peer_req->digest);
- drbd_free_pages(device, peer_req->pages, is_net);
+ drbd_free_pages(device, peer_req->pages);
D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
D_ASSERT(device, drbd_interval_empty(&peer_req->i));
if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
@@ -424,14 +242,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
LIST_HEAD(work_list);
struct drbd_peer_request *peer_req, *t;
int count = 0;
- int is_net = list == &device->net_ee;
spin_lock_irq(&device->resource->req_lock);
list_splice_init(list, &work_list);
spin_unlock_irq(&device->resource->req_lock);
list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
- __drbd_free_peer_req(device, peer_req, is_net);
+ drbd_free_peer_req(device, peer_req);
count++;
}
return count;
@@ -443,18 +260,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
static int drbd_finish_peer_reqs(struct drbd_device *device)
{
LIST_HEAD(work_list);
- LIST_HEAD(reclaimed);
struct drbd_peer_request *peer_req, *t;
int err = 0;
spin_lock_irq(&device->resource->req_lock);
- reclaim_finished_net_peer_reqs(device, &reclaimed);
list_splice_init(&device->done_ee, &work_list);
spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
- drbd_free_net_peer_req(device, peer_req);
-
/* possible callbacks here:
* e_end_block, and e_end_resync_block, e_send_superseded.
* all ignore the last argument.
@@ -1975,7 +1787,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
data_size -= len;
}
kunmap(page);
- drbd_free_pages(peer_device->device, page, 0);
+ drbd_free_pages(peer_device->device, page);
return err;
}
@@ -5224,16 +5036,6 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
put_ldev(device);
}
- /* tcp_close and release of sendpage pages can be deferred. I don't
- * want to use SO_LINGER, because apparently it can be deferred for
- * more than 20 seconds (longest time I checked).
- *
- * Actually we don't care for exactly when the network stack does its
- * put_page(), but release our reference on these pages right here.
- */
- i = drbd_free_peer_reqs(device, &device->net_ee);
- if (i)
- drbd_info(device, "net_ee not empty, killed %u entries\n", i);
i = atomic_read(&device->pp_in_use_by_net);
if (i)
drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
@@ -5980,8 +5782,6 @@ int drbd_ack_receiver(struct drbd_thread *thi)
while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi);
- conn_reclaim_net_peer_reqs(connection);
-
if (test_and_clear_bit(SEND_PING, &connection->flags)) {
if (drbd_send_ping(connection)) {
drbd_err(connection, "drbd_send_ping has failed\n");
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index a6ea737b3b71..dea3e79d044f 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1030,22 +1030,6 @@ out:
return 1;
}
-/* helper */
-static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
-{
- if (drbd_peer_req_has_active_page(peer_req)) {
- /* This might happen if sendpage() has not finished */
- int i = PFN_UP(peer_req->i.size);
- atomic_add(i, &device->pp_in_use_by_net);
- atomic_sub(i, &device->pp_in_use);
- spin_lock_irq(&device->resource->req_lock);
- list_add_tail(&peer_req->w.list, &device->net_ee);
- spin_unlock_irq(&device->resource->req_lock);
- wake_up(&drbd_pp_wait);
- } else
- drbd_free_peer_req(device, peer_req);
-}
-
/**
* w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
* @w: work object.
@@ -1059,9 +1043,8 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
int err;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
@@ -1074,12 +1057,12 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
}
- dec_unacked(device);
-
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
@@ -1120,9 +1103,8 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
int err;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (get_ldev_if_state(device, D_FAILED)) {
@@ -1155,13 +1137,12 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
/* update resync data with failure */
drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size);
}
-
- dec_unacked(device);
-
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
@@ -1176,9 +1157,8 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
int err, eq = 0;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (get_ldev(device)) {
@@ -1220,12 +1200,12 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
if (drbd_ratelimit())
drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
}
-
- dec_unacked(device);
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block/ack() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 24be0c2c4075..5336c3c5ca36 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -163,35 +163,35 @@
/* do print messages for unexpected interrupts */
static int print_unex = 1;
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/workqueue.h>
-#include <linux/fdreg.h>
-#include <linux/fd.h>
-#include <linux/hdreg.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
+#include <linux/async.h>
#include <linux/bio.h>
-#include <linux/string.h>
-#include <linux/jiffies.h>
-#include <linux/fcntl.h>
+#include <linux/compat.h>
#include <linux/delay.h>
-#include <linux/mc146818rtc.h> /* CMOS defines */
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/fd.h>
+#include <linux/fdreg.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
#include <linux/major.h>
-#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h> /* CMOS defines */
+#include <linux/mm.h>
#include <linux/mod_devicetable.h>
+#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/io.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/timer.h>
#include <linux/uaccess.h>
-#include <linux/async.h>
-#include <linux/compat.h>
+#include <linux/workqueue.h>
/*
* PS/2 floppies have much slower step rates than regular floppies.
@@ -233,8 +233,6 @@ static unsigned short virtual_dma_port = 0x3f0;
irqreturn_t floppy_interrupt(int irq, void *dev_id);
static int set_dor(int fdc, char mask, char data);
-#define K_64 0x10000 /* 64KB */
-
/* the following is the mask of allowed drives. By default units 2 and
* 3 of both floppy controllers are disabled, because switching on the
* motor of these drives causes system hangs on some PCI computers. drive
@@ -3092,16 +3090,13 @@ static int raw_cmd_copyin(int cmd, void __user *param,
*rcmd = NULL;
loop:
- ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL);
- if (!ptr)
- return -ENOMEM;
+ ptr = memdup_user(param, sizeof(*ptr));
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
*rcmd = ptr;
- ret = copy_from_user(ptr, param, sizeof(*ptr));
ptr->next = NULL;
ptr->buffer_length = 0;
ptr->kernel_data = NULL;
- if (ret)
- return -EFAULT;
param += sizeof(struct floppy_raw_cmd);
if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE)
return -EINVAL;
@@ -3363,9 +3358,9 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
return 0;
}
-static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- int drive = (long)bdev->bd_disk->private_data;
+ int drive = (long)disk->private_data;
int type = ITYPE(drive_state[drive].fd_device);
struct floppy_struct *g;
int ret;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1b6ee91f8eb9..053a086d547e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -137,20 +137,35 @@ static void loop_global_unlock(struct loop_device *lo, bool global)
static int max_part;
static int part_shift;
-static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
+static loff_t lo_calculate_size(struct loop_device *lo, struct file *file)
{
loff_t loopsize;
+ int ret;
+
+ if (S_ISBLK(file_inode(file)->i_mode)) {
+ loopsize = i_size_read(file->f_mapping->host);
+ } else {
+ struct kstat stat;
+
+ /*
+ * Get the accurate file size. This provides better results than
+ * cached inode data, particularly for network filesystems where
+ * metadata may be stale.
+ */
+ ret = vfs_getattr_nosec(&file->f_path, &stat, STATX_SIZE, 0);
+ if (ret)
+ return 0;
- /* Compute loopsize in bytes */
- loopsize = i_size_read(file->f_mapping->host);
- if (offset > 0)
- loopsize -= offset;
+ loopsize = stat.size;
+ }
+
+ if (lo->lo_offset > 0)
+ loopsize -= lo->lo_offset;
/* offset is beyond i_size, weird but possible */
if (loopsize < 0)
return 0;
-
- if (sizelimit > 0 && sizelimit < loopsize)
- loopsize = sizelimit;
+ if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
+ loopsize = lo->lo_sizelimit;
/*
* Unfortunately, if we want to do I/O on the device,
* the number of 512-byte sectors has to fit into a sector_t.
@@ -158,11 +173,6 @@ static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
return loopsize >> 9;
}
-static loff_t get_loop_size(struct loop_device *lo, struct file *file)
-{
- return get_size(lo->lo_offset, lo->lo_sizelimit, file);
-}
-
/*
* We support direct I/O only if lo_offset is aligned with the logical I/O size
* of backing device, and the logical block size of loop is bigger than that of
@@ -569,7 +579,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
error = -EINVAL;
/* size of the new backing store needs to be the same */
- if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
+ if (lo_calculate_size(lo, file) != lo_calculate_size(lo, old_file))
goto out_err;
/*
@@ -1063,7 +1073,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
loop_update_dio(lo);
loop_sysfs_init(lo);
- size = get_loop_size(lo, file);
+ size = lo_calculate_size(lo, file);
loop_set_size(lo, size);
/* Order wrt reading lo_state in loop_validate_file(). */
@@ -1255,8 +1265,7 @@ out_unfreeze:
if (partscan)
clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
if (!err && size_changed) {
- loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
- lo->lo_backing_file);
+ loff_t new_size = lo_calculate_size(lo, lo->lo_backing_file);
loop_set_size(lo, new_size);
}
out_unlock:
@@ -1399,7 +1408,7 @@ static int loop_set_capacity(struct loop_device *lo)
if (unlikely(lo->lo_state != Lo_bound))
return -ENXIO;
- size = get_loop_size(lo, lo->lo_backing_file);
+ size = lo_calculate_size(lo, lo->lo_backing_file);
loop_set_size(lo, size);
return 0;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 8fc7761397bd..567192e371a8 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3148,17 +3148,17 @@ static int mtip_block_compat_ioctl(struct block_device *dev,
* that each partition is also 4KB aligned. Non-aligned partitions adversely
* affects performance.
*
- * @dev Pointer to the block_device strucutre.
+ * @disk Pointer to the gendisk strucutre.
* @geo Pointer to a hd_geometry structure.
*
* return value
* 0 Operation completed successfully.
* -ENOTTY An error occurred while reading the drive capacity.
*/
-static int mtip_block_getgeo(struct block_device *dev,
+static int mtip_block_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
- struct driver_data *dd = dev->bd_disk->private_data;
+ struct driver_data *dd = disk->private_data;
sector_t capacity;
if (!dd)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6463d0e8d0ce..1188f32a5e5e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -311,7 +311,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
if (args) {
INIT_WORK(&args->work, nbd_dead_link_work);
args->index = nbd->index;
- queue_work(system_wq, &args->work);
+ queue_work(system_percpu_wq, &args->work);
}
}
if (!nsock->dead) {
@@ -1217,6 +1217,14 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
if (!sock)
return NULL;
+ if (!sk_is_tcp(sock->sk) &&
+ !sk_is_stream_unix(sock->sk)) {
+ dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n");
+ *err = -EINVAL;
+ sockfd_put(sock);
+ return NULL;
+ }
+
if (sock->ops->shutdown == sock_no_shutdown) {
dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
*err = -EINVAL;
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index aa163ae9b2aa..f982027e8c85 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -223,7 +223,7 @@ MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed nu
static unsigned long g_cache_size;
module_param_named(cache_size, g_cache_size, ulong, 0444);
-MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");
static bool g_fua = true;
module_param_named(fua, g_fua, bool, 0444);
@@ -1179,7 +1179,7 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
memcpy_page(dest, off + count, t_page->page, offset,
temp);
else
- zero_user(dest, off + count, temp);
+ memzero_page(dest, off + count, temp);
count += temp;
sector += temp >> SECTOR_SHIFT;
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index faafd7ff43d6..af0e21149dbc 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -7389,7 +7389,7 @@ static int __init rbd_init(void)
* The number of active work items is limited by the number of
* rbd devices * queue depth, so leave @max_active at default.
*/
- rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
+ rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!rbd_wq) {
rc = -ENOMEM;
goto err_out_slab;
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index 15627417f12e..f1409e54010a 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -942,11 +942,11 @@ static void rnbd_client_release(struct gendisk *gen)
rnbd_clt_put_dev(dev);
}
-static int rnbd_client_getgeo(struct block_device *block_device,
+static int rnbd_client_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
u64 size;
- struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
+ struct rnbd_clt_dev *dev = disk->private_data;
struct queue_limits *limit = &dev->queue->limits;
size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
@@ -1809,7 +1809,7 @@ static int __init rnbd_client_init(void)
unregister_blkdev(rnbd_client_major, "rnbd");
return err;
}
- rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0);
+ rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", WQ_PERCPU, 0);
if (!rnbd_clt_wq) {
pr_err("Failed to load module, alloc_workqueue failed.\n");
rnbd_clt_destroy_sysfs_files();
diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs
deleted file mode 100644
index d07e76ae2c13..000000000000
--- a/drivers/block/rnull.rs
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-//! This is a Rust implementation of the C null block driver.
-//!
-//! Supported features:
-//!
-//! - blk-mq interface
-//! - direct completion
-//! - block size 4k
-//!
-//! The driver is not configurable.
-
-use kernel::{
- alloc::flags,
- block::mq::{
- self,
- gen_disk::{self, GenDisk},
- Operations, TagSet,
- },
- error::Result,
- new_mutex, pr_info,
- prelude::*,
- sync::{Arc, Mutex},
- types::ARef,
-};
-
-module! {
- type: NullBlkModule,
- name: "rnull_mod",
- authors: ["Andreas Hindborg"],
- description: "Rust implementation of the C null block driver",
- license: "GPL v2",
-}
-
-#[pin_data]
-struct NullBlkModule {
- #[pin]
- _disk: Mutex<GenDisk<NullBlkDevice>>,
-}
-
-impl kernel::InPlaceModule for NullBlkModule {
- fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> {
- pr_info!("Rust null_blk loaded\n");
-
- // Use a immediately-called closure as a stable `try` block
- let disk = /* try */ (|| {
- let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;
-
- gen_disk::GenDiskBuilder::new()
- .capacity_sectors(4096 << 11)
- .logical_block_size(4096)?
- .physical_block_size(4096)?
- .rotational(false)
- .build(format_args!("rnullb{}", 0), tagset)
- })();
-
- try_pin_init!(Self {
- _disk <- new_mutex!(disk?, "nullb:disk"),
- })
- }
-}
-
-struct NullBlkDevice;
-
-#[vtable]
-impl Operations for NullBlkDevice {
- #[inline(always)]
- fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
- mq::Request::end_ok(rq)
- .map_err(|_e| kernel::error::code::EIO)
- // We take no refcounts on the request, so we expect to be able to
- // end the request. The request reference must be unique at this
- // point, and so `end_ok` cannot fail.
- .expect("Fatal error - expected to be able to end request");
-
- Ok(())
- }
-
- fn commit_rqs() {}
-}
diff --git a/drivers/block/rnull/Kconfig b/drivers/block/rnull/Kconfig
new file mode 100644
index 000000000000..7bc5b376c128
--- /dev/null
+++ b/drivers/block/rnull/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Rust null block device driver configuration
+
+config BLK_DEV_RUST_NULL
+ tristate "Rust null block driver (Experimental)"
+ depends on RUST && CONFIGFS_FS
+ help
+ This is the Rust implementation of the null block driver. Like
+ the C version, the driver allows the user to create virutal block
+ devices that can be configured via various configuration options.
+
+ If unsure, say N.
diff --git a/drivers/block/rnull/Makefile b/drivers/block/rnull/Makefile
new file mode 100644
index 000000000000..11cfa5e615dc
--- /dev/null
+++ b/drivers/block/rnull/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
+rnull_mod-y := rnull.o
diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs
new file mode 100644
index 000000000000..8498e9bae6fd
--- /dev/null
+++ b/drivers/block/rnull/configfs.rs
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use super::{NullBlkDevice, THIS_MODULE};
+use core::fmt::{Display, Write};
+use kernel::{
+ block::mq::gen_disk::{GenDisk, GenDiskBuilder},
+ c_str,
+ configfs::{self, AttributeOperations},
+ configfs_attrs, new_mutex,
+ page::PAGE_SIZE,
+ prelude::*,
+ str::{kstrtobool_bytes, CString},
+ sync::Mutex,
+};
+use pin_init::PinInit;
+
+pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, Error> {
+ let item_type = configfs_attrs! {
+ container: configfs::Subsystem<Config>,
+ data: Config,
+ child: DeviceConfig,
+ attributes: [
+ features: 0,
+ ],
+ };
+
+ kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {}))
+}
+
+#[pin_data]
+pub(crate) struct Config {}
+
+#[vtable]
+impl AttributeOperations<0> for Config {
+ type Data = Config;
+
+ fn show(_this: &Config, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+ writer.write_str("blocksize,size,rotational,irqmode\n")?;
+ Ok(writer.bytes_written())
+ }
+}
+
+#[vtable]
+impl configfs::GroupOperations for Config {
+ type Child = DeviceConfig;
+
+ fn make_group(
+ &self,
+ name: &CStr,
+ ) -> Result<impl PinInit<configfs::Group<DeviceConfig>, Error>> {
+ let item_type = configfs_attrs! {
+ container: configfs::Group<DeviceConfig>,
+ data: DeviceConfig,
+ attributes: [
+ // Named for compatibility with C null_blk
+ power: 0,
+ blocksize: 1,
+ rotational: 2,
+ size: 3,
+ irqmode: 4,
+ ],
+ };
+
+ Ok(configfs::Group::new(
+ name.try_into()?,
+ item_type,
+ // TODO: cannot coerce new_mutex!() to impl PinInit<_, Error>, so put mutex inside
+ try_pin_init!( DeviceConfig {
+ data <- new_mutex!(DeviceConfigInner {
+ powered: false,
+ block_size: 4096,
+ rotational: false,
+ disk: None,
+ capacity_mib: 4096,
+ irq_mode: IRQMode::None,
+ name: name.try_into()?,
+ }),
+ }),
+ ))
+ }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum IRQMode {
+ None,
+ Soft,
+}
+
+impl TryFrom<u8> for IRQMode {
+ type Error = kernel::error::Error;
+
+ fn try_from(value: u8) -> Result<Self> {
+ match value {
+ 0 => Ok(Self::None),
+ 1 => Ok(Self::Soft),
+ _ => Err(EINVAL),
+ }
+ }
+}
+
+impl Display for IRQMode {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self {
+ Self::None => f.write_str("0")?,
+ Self::Soft => f.write_str("1")?,
+ }
+ Ok(())
+ }
+}
+
+#[pin_data]
+pub(crate) struct DeviceConfig {
+ #[pin]
+ data: Mutex<DeviceConfigInner>,
+}
+
+#[pin_data]
+struct DeviceConfigInner {
+ powered: bool,
+ name: CString,
+ block_size: u32,
+ rotational: bool,
+ capacity_mib: u64,
+ irq_mode: IRQMode,
+ disk: Option<GenDisk<NullBlkDevice>>,
+}
+
+#[vtable]
+impl configfs::AttributeOperations<0> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+
+ if this.data.lock().powered {
+ writer.write_str("1\n")?;
+ } else {
+ writer.write_str("0\n")?;
+ }
+
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ let power_op = kstrtobool_bytes(page)?;
+ let mut guard = this.data.lock();
+
+ if !guard.powered && power_op {
+ guard.disk = Some(NullBlkDevice::new(
+ &guard.name,
+ guard.block_size,
+ guard.rotational,
+ guard.capacity_mib,
+ guard.irq_mode,
+ )?);
+ guard.powered = true;
+ } else if guard.powered && !power_op {
+ drop(guard.disk.take());
+ guard.powered = false;
+ }
+
+ Ok(())
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<1> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+ writer.write_fmt(fmt!("{}\n", this.data.lock().block_size))?;
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ if this.data.lock().powered {
+ return Err(EBUSY);
+ }
+
+ let text = core::str::from_utf8(page)?.trim();
+ let value = text.parse::<u32>().map_err(|_| EINVAL)?;
+
+ GenDiskBuilder::validate_block_size(value)?;
+ this.data.lock().block_size = value;
+ Ok(())
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<2> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+
+ if this.data.lock().rotational {
+ writer.write_str("1\n")?;
+ } else {
+ writer.write_str("0\n")?;
+ }
+
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ if this.data.lock().powered {
+ return Err(EBUSY);
+ }
+
+ this.data.lock().rotational = kstrtobool_bytes(page)?;
+
+ Ok(())
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<3> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+ writer.write_fmt(fmt!("{}\n", this.data.lock().capacity_mib))?;
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ if this.data.lock().powered {
+ return Err(EBUSY);
+ }
+
+ let text = core::str::from_utf8(page)?.trim();
+ let value = text.parse::<u64>().map_err(|_| EINVAL)?;
+
+ this.data.lock().capacity_mib = value;
+ Ok(())
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<4> for DeviceConfig {
+ type Data = DeviceConfig;
+
+ fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ let mut writer = kernel::str::Formatter::new(page);
+ writer.write_fmt(fmt!("{}\n", this.data.lock().irq_mode))?;
+ Ok(writer.bytes_written())
+ }
+
+ fn store(this: &DeviceConfig, page: &[u8]) -> Result {
+ if this.data.lock().powered {
+ return Err(EBUSY);
+ }
+
+ let text = core::str::from_utf8(page)?.trim();
+ let value = text.parse::<u8>().map_err(|_| EINVAL)?;
+
+ this.data.lock().irq_mode = IRQMode::try_from(value)?;
+ Ok(())
+ }
+}
diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
new file mode 100644
index 000000000000..1ec694d7f1a6
--- /dev/null
+++ b/drivers/block/rnull/rnull.rs
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This is a Rust implementation of the C null block driver.
+
+mod configfs;
+
+use configfs::IRQMode;
+use kernel::{
+ block::{
+ self,
+ mq::{
+ self,
+ gen_disk::{self, GenDisk},
+ Operations, TagSet,
+ },
+ },
+ error::Result,
+ pr_info,
+ prelude::*,
+ sync::Arc,
+ types::ARef,
+};
+use pin_init::PinInit;
+
+module! {
+ type: NullBlkModule,
+ name: "rnull_mod",
+ authors: ["Andreas Hindborg"],
+ description: "Rust implementation of the C null block driver",
+ license: "GPL v2",
+}
+
+#[pin_data]
+struct NullBlkModule {
+ #[pin]
+ configfs_subsystem: kernel::configfs::Subsystem<configfs::Config>,
+}
+
+impl kernel::InPlaceModule for NullBlkModule {
+ fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> {
+ pr_info!("Rust null_blk loaded\n");
+
+ try_pin_init!(Self {
+ configfs_subsystem <- configfs::subsystem(),
+ })
+ }
+}
+
+struct NullBlkDevice;
+
+impl NullBlkDevice {
+ fn new(
+ name: &CStr,
+ block_size: u32,
+ rotational: bool,
+ capacity_mib: u64,
+ irq_mode: IRQMode,
+ ) -> Result<GenDisk<Self>> {
+ let tagset = Arc::pin_init(TagSet::new(1, 256, 1), GFP_KERNEL)?;
+
+ let queue_data = Box::new(QueueData { irq_mode }, GFP_KERNEL)?;
+
+ gen_disk::GenDiskBuilder::new()
+ .capacity_sectors(capacity_mib << (20 - block::SECTOR_SHIFT))
+ .logical_block_size(block_size)?
+ .physical_block_size(block_size)?
+ .rotational(rotational)
+ .build(fmt!("{}", name.to_str()?), tagset, queue_data)
+ }
+}
+
+struct QueueData {
+ irq_mode: IRQMode,
+}
+
+#[vtable]
+impl Operations for NullBlkDevice {
+ type QueueData = KBox<QueueData>;
+
+ #[inline(always)]
+ fn queue_rq(queue_data: &QueueData, rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
+ match queue_data.irq_mode {
+ IRQMode::None => mq::Request::end_ok(rq)
+ .map_err(|_e| kernel::error::code::EIO)
+ // We take no refcounts on the request, so we expect to be able to
+ // end the request. The request reference must be unique at this
+ // point, and so `end_ok` cannot fail.
+ .expect("Fatal error - expected to be able to end request"),
+ IRQMode::Soft => mq::Request::complete(rq),
+ }
+ Ok(())
+ }
+
+ fn commit_rqs(_queue_data: &QueueData) {}
+
+ fn complete(rq: ARef<mq::Request<Self>>) {
+ mq::Request::end_ok(rq)
+ .map_err(|_e| kernel::error::code::EIO)
+ // We take no refcounts on the request, so we expect to be able to
+ // end the request. The request reference must be unique at this
+ // point, and so `end_ok` cannot fail.
+ .expect("Fatal error - expected to be able to end request");
+ }
+}
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 7af21fe67671..db1fe9772a4d 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -119,9 +119,8 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr)
return vio_dring_avail(dr, VDC_TX_RING_SIZE);
}
-static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int vdc_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct gendisk *disk = bdev->bd_disk;
sector_t nsect = get_capacity(disk);
sector_t cylinders = nsect;
@@ -1189,7 +1188,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
}
if (port->ldc_timeout)
- mod_delayed_work(system_wq, &port->ldc_reset_timer_work,
+ mod_delayed_work(system_percpu_wq, &port->ldc_reset_timer_work,
round_jiffies(jiffies + HZ * port->ldc_timeout));
mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ));
return;
@@ -1217,7 +1216,7 @@ static int __init vdc_init(void)
{
int err;
- sunvdc_wq = alloc_workqueue("sunvdc", 0, 0);
+ sunvdc_wq = alloc_workqueue("sunvdc", WQ_PERCPU, 0);
if (!sunvdc_wq)
return -ENOMEM;
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index eda33c5eb5e2..416015947ae6 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -711,9 +711,9 @@ static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY;
}
-static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int floppy_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct floppy_state *fs = bdev->bd_disk->private_data;
+ struct floppy_state *fs = disk->private_data;
struct floppy_struct *g;
int ret;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6561d2a561fa..0c74a41a6753 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -201,7 +201,6 @@ struct ublk_queue {
bool force_abort;
bool canceling;
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
- unsigned short nr_io_ready; /* how many ios setup */
spinlock_t cancel_lock;
struct ublk_device *dev;
struct ublk_io ios[];
@@ -234,11 +233,12 @@ struct ublk_device {
struct ublk_params params;
struct completion completion;
- unsigned int nr_queues_ready;
- unsigned int nr_privileged_daemon;
+ u32 nr_io_ready;
+ bool unprivileged_daemons;
struct mutex cancel_mutex;
bool canceling;
pid_t ublksrv_tgid;
+ struct delayed_work exit_work;
};
/* header of ublk_params */
@@ -251,8 +251,7 @@ static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, struct ublk_io *io,
- size_t offset);
+ u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *
@@ -531,7 +530,8 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
-static inline void __ublk_complete_rq(struct request *req);
+static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
+ bool need_map);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@@ -663,22 +663,44 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
+static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
+}
+
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_AUTO_BUF_REG;
}
+static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
+}
+
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_USER_COPY;
}
+static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_USER_COPY;
+}
+
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
!ublk_support_auto_buf_reg(ubq);
}
+static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
+{
+ return !ublk_dev_support_user_copy(ub) &&
+ !ublk_dev_support_zero_copy(ub) &&
+ !ublk_dev_support_auto_buf_reg(ub);
+}
+
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
/*
@@ -696,6 +718,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
ublk_support_auto_buf_reg(ubq);
}
+static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
+{
+ return ublk_dev_support_user_copy(ub) ||
+ ublk_dev_support_zero_copy(ub) ||
+ ublk_dev_support_auto_buf_reg(ub);
+}
+
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
struct ublk_io *io)
{
@@ -710,8 +739,11 @@ static inline bool ublk_get_req_ref(struct ublk_io *io)
static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
{
- if (refcount_dec_and_test(&io->ref))
- __ublk_complete_rq(req);
+ if (!refcount_dec_and_test(&io->ref))
+ return;
+
+ /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
+ __ublk_complete_rq(req, io, false);
}
static inline bool ublk_sub_req_ref(struct ublk_io *io)
@@ -727,6 +759,11 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_NEED_GET_DATA;
}
+static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
+}
+
/* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
@@ -763,11 +800,9 @@ static inline int __ublk_queue_cmd_buf_size(int depth)
return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
}
-static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
+static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
{
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
-
- return __ublk_queue_cmd_buf_size(ubq->q_depth);
+ return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
}
static int ublk_max_cmd_buf_size(void)
@@ -1018,13 +1053,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
return rq_bytes;
}
-static int ublk_unmap_io(const struct ublk_queue *ubq,
+static int ublk_unmap_io(bool need_map,
const struct request *req,
const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
- if (!ublk_need_map_io(ubq))
+ if (!need_map)
return rq_bytes;
if (ublk_need_unmap_req(req)) {
@@ -1071,13 +1106,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_io *io = &ubq->ios[req->tag];
- enum req_op op = req_op(req);
u32 ublk_op;
- if (!ublk_queue_is_zoned(ubq) &&
- (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
- return BLK_STS_IOERR;
-
switch (req_op(req)) {
case REQ_OP_READ:
ublk_op = UBLK_IO_OP_READ;
@@ -1116,10 +1146,9 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
}
/* todo: handle partial completion */
-static inline void __ublk_complete_rq(struct request *req)
+static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
+ bool need_map)
{
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- struct ublk_io *io = &ubq->ios[req->tag];
unsigned int unmapped_bytes;
blk_status_t res = BLK_STS_OK;
@@ -1143,7 +1172,7 @@ static inline void __ublk_complete_rq(struct request *req)
goto exit;
/* for READ request, writing data in iod->addr to rq buffers */
- unmapped_bytes = ublk_unmap_io(ubq, req, io);
+ unmapped_bytes = ublk_unmap_io(need_map, req, io);
/*
* Extremely impossible since we got data filled in just before
@@ -1188,7 +1217,7 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
/* tell ublksrv one io request is coming */
- io_uring_cmd_done(cmd, res, 0, issue_flags);
+ io_uring_cmd_done(cmd, res, issue_flags);
}
#define UBLK_REQUEUE_DELAY_MS 3
@@ -1389,7 +1418,7 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
{
blk_status_t res;
- if (unlikely(ubq->fail_io))
+ if (unlikely(READ_ONCE(ubq->fail_io)))
return BLK_STS_TARGET;
/* With recovery feature enabled, force_abort is set in
@@ -1401,7 +1430,8 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
* Note: force_abort is guaranteed to be seen because it is set
* before request queue is unqiuesced.
*/
- if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
+ if (ublk_nosrv_should_queue_io(ubq) &&
+ unlikely(READ_ONCE(ubq->force_abort)))
return BLK_STS_IOERR;
if (check_cancel && unlikely(ubq->canceling))
@@ -1498,9 +1528,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;
- /* All old ioucmds have to be completed */
- ubq->nr_io_ready = 0;
-
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -1549,8 +1576,8 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
ub->mm = NULL;
- ub->nr_queues_ready = 0;
- ub->nr_privileged_daemon = 0;
+ ub->nr_io_ready = 0;
+ ub->unprivileged_daemons = false;
ub->ublksrv_tgid = -1;
}
@@ -1594,13 +1621,63 @@ static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
ublk_get_queue(ub, i)->canceling = canceling;
}
-static int ublk_ch_release(struct inode *inode, struct file *filp)
+static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
{
- struct ublk_device *ub = filp->private_data;
+ int i, j;
+
+ if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG)))
+ return false;
+
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ for (j = 0; j < ubq->q_depth; j++) {
+ struct ublk_io *io = &ubq->ios[j];
+ unsigned int refs = refcount_read(&io->ref) +
+ io->task_registered_buffers;
+
+ /*
+ * UBLK_REFCOUNT_INIT or zero means no active
+ * reference
+ */
+ if (refs != UBLK_REFCOUNT_INIT && refs != 0)
+ return true;
+
+ /* reset to zero if the io hasn't active references */
+ refcount_set(&io->ref, 0);
+ io->task_registered_buffers = 0;
+ }
+ }
+ return false;
+}
+
+static void ublk_ch_release_work_fn(struct work_struct *work)
+{
+ struct ublk_device *ub =
+ container_of(work, struct ublk_device, exit_work.work);
struct gendisk *disk;
int i;
/*
+ * For zero-copy and auto buffer register modes, I/O references
+ * might not be dropped naturally when the daemon is killed, but
+ * io_uring guarantees that registered bvec kernel buffers are
+ * unregistered finally when freeing io_uring context, then the
+ * active references are dropped.
+ *
+ * Wait until active references are dropped for avoiding use-after-free
+ *
+ * registered buffer may be unregistered in io_ring's release hander,
+ * so have to wait by scheduling work function for avoiding the two
+ * file release dependency.
+ */
+ if (ublk_check_and_reset_active_ref(ub)) {
+ schedule_delayed_work(&ub->exit_work, 1);
+ return;
+ }
+
+ /*
* disk isn't attached yet, either device isn't live, or it has
* been removed already, so we needn't to do anything
*/
@@ -1644,7 +1721,6 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* Transition the device to the nosrv state. What exactly this
* means depends on the recovery flags
*/
- blk_mq_quiesce_queue(disk->queue);
if (ublk_nosrv_should_stop_dev(ub)) {
/*
* Allow any pending/future I/O to pass through quickly
@@ -1652,8 +1728,7 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* waits for all pending I/O to complete
*/
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_get_queue(ub, i)->force_abort = true;
- blk_mq_unquiesce_queue(disk->queue);
+ WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
ublk_stop_dev_unlocked(ub);
} else {
@@ -1663,9 +1738,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
} else {
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_get_queue(ub, i)->fail_io = true;
+ WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
}
- blk_mq_unquiesce_queue(disk->queue);
}
unlock:
mutex_unlock(&ub->mutex);
@@ -1675,6 +1749,23 @@ unlock:
ublk_reset_ch_dev(ub);
out:
clear_bit(UB_STATE_OPEN, &ub->state);
+
+ /* put the reference grabbed in ublk_ch_release() */
+ ublk_put_device(ub);
+}
+
+static int ublk_ch_release(struct inode *inode, struct file *filp)
+{
+ struct ublk_device *ub = filp->private_data;
+
+ /*
+ * Grab ublk device reference, so it won't be gone until we are
+ * really released from work function.
+ */
+ ublk_get_device(ub);
+
+ INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
+ schedule_delayed_work(&ub->exit_work, 0);
return 0;
}
@@ -1709,23 +1800,23 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
__func__, q_id, current->pid, vma->vm_start,
phys_off, (unsigned long)sz);
- if (sz != ublk_queue_cmd_buf_size(ub, q_id))
+ if (sz != ublk_queue_cmd_buf_size(ub))
return -EINVAL;
pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
-static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
struct request *req)
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
- if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
+ if (ublk_nosrv_should_reissue_outstanding(ub))
blk_mq_requeue_request(req, false);
else {
io->res = -EIO;
- __ublk_complete_rq(req);
+ __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
}
}
@@ -1745,7 +1836,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
struct ublk_io *io = &ubq->ios[i];
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
- __ublk_fail_req(ubq, io, io->req);
+ __ublk_fail_req(ub, io, io->req);
}
}
@@ -1807,7 +1898,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
spin_unlock(&ubq->cancel_lock);
if (!done)
- io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
+ io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
}
/*
@@ -1850,9 +1941,11 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
}
-static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+static inline bool ublk_dev_ready(const struct ublk_device *ub)
{
- return ubq->nr_io_ready == ubq->q_depth;
+ u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth;
+
+ return ub->nr_io_ready == total;
}
static void ublk_cancel_queue(struct ublk_queue *ubq)
@@ -1976,18 +2069,14 @@ static void ublk_reset_io_flags(struct ublk_device *ub)
}
/* device can only be started after all IOs are ready */
-static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
+static void ublk_mark_io_ready(struct ublk_device *ub)
__must_hold(&ub->mutex)
{
- ubq->nr_io_ready++;
- if (ublk_queue_ready(ubq)) {
- ub->nr_queues_ready++;
-
- if (capable(CAP_SYS_ADMIN))
- ub->nr_privileged_daemon++;
- }
+ if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
+ ub->unprivileged_daemons = true;
- if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
+ ub->nr_io_ready++;
+ if (ublk_dev_ready(ub)) {
/* now we are ready for handling ublk io request */
ublk_reset_io_flags(ub);
complete_all(&ub->completion);
@@ -2058,11 +2147,11 @@ ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
}
static inline int
-ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io,
+ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
struct io_uring_cmd *cmd, unsigned long buf_addr,
u16 *buf_idx)
{
- if (ublk_support_auto_buf_reg(ubq))
+ if (ublk_dev_support_auto_buf_reg(ub))
return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
io->addr = buf_addr;
@@ -2101,18 +2190,18 @@ static void ublk_io_release(void *priv)
}
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq,
+ struct ublk_device *ub,
+ u16 q_id, u16 tag,
struct ublk_io *io,
unsigned int index, unsigned int issue_flags)
{
- struct ublk_device *ub = cmd->file->private_data;
struct request *req;
int ret;
- if (!ublk_support_zero_copy(ubq))
+ if (!ublk_dev_support_zero_copy(ub))
return -EINVAL;
- req = __ublk_check_and_get_req(ub, ubq, io, 0);
+ req = __ublk_check_and_get_req(ub, q_id, tag, io, 0);
if (!req)
return -EINVAL;
@@ -2128,7 +2217,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
static int
ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, struct ublk_io *io,
+ struct ublk_device *ub,
+ u16 q_id, u16 tag, struct ublk_io *io,
unsigned index, unsigned issue_flags)
{
unsigned new_registered_buffers;
@@ -2141,9 +2231,10 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
*/
new_registered_buffers = io->task_registered_buffers + 1;
if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
- return ublk_register_io_buf(cmd, ubq, io, index, issue_flags);
+ return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
+ issue_flags);
- if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
+ if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
return -EINVAL;
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
@@ -2165,14 +2256,14 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
-static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
+static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
{
- if (ublk_need_map_io(ubq)) {
+ if (ublk_dev_need_map_io(ub)) {
/*
* FETCH_RQ has to provide IO buffer if NEED GET
* DATA is not enabled
*/
- if (!buf_addr && !ublk_need_get_data(ubq))
+ if (!buf_addr && !ublk_dev_need_get_data(ub))
return -EINVAL;
} else if (buf_addr) {
/* User copy requires addr to be unset */
@@ -2181,10 +2272,9 @@ static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
return 0;
}
-static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
+static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
struct ublk_io *io, __u64 buf_addr)
{
- struct ublk_device *ub = ubq->dev;
int ret = 0;
/*
@@ -2193,8 +2283,8 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
* FETCH, so it is fine even for IO_URING_F_NONBLOCK.
*/
mutex_lock(&ub->mutex);
- /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
- if (ublk_queue_ready(ubq)) {
+ /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
+ if (ublk_dev_ready(ub)) {
ret = -EBUSY;
goto out;
}
@@ -2208,28 +2298,28 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL);
+ ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
if (ret)
goto out;
WRITE_ONCE(io->task, get_task_struct(current));
- ublk_mark_io_ready(ub, ubq);
+ ublk_mark_io_ready(ub);
out:
mutex_unlock(&ub->mutex);
return ret;
}
-static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
+static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
struct ublk_io *io, __u64 buf_addr)
{
struct request *req = io->req;
- if (ublk_need_map_io(ubq)) {
+ if (ublk_dev_need_map_io(ub)) {
/*
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
* NEED GET DATA is not enabled or it is Read IO.
*/
- if (!buf_addr && (!ublk_need_get_data(ubq) ||
+ if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
req_op(req) == REQ_OP_READ))
return -EINVAL;
} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
@@ -2243,10 +2333,10 @@ static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
return 0;
}
-static bool ublk_need_complete_req(const struct ublk_queue *ubq,
+static bool ublk_need_complete_req(const struct ublk_device *ub,
struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq))
+ if (ublk_dev_need_req_ref(ub))
return ublk_sub_req_ref(io);
return true;
}
@@ -2269,23 +2359,28 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
return ublk_start_io(ubq, req, io);
}
-static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
- unsigned int issue_flags,
- const struct ublksrv_io_cmd *ub_cmd)
+static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
+ /* May point to userspace-mapped memory */
+ const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
u16 buf_idx = UBLK_INVALID_BUF_IDX;
struct ublk_device *ub = cmd->file->private_data;
struct ublk_queue *ubq;
struct ublk_io *io;
u32 cmd_op = cmd->cmd_op;
- unsigned tag = ub_cmd->tag;
+ u16 q_id = READ_ONCE(ub_src->q_id);
+ u16 tag = READ_ONCE(ub_src->tag);
+ s32 result = READ_ONCE(ub_src->result);
+ u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
struct request *req;
int ret;
bool compl;
+ WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
+
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
- __func__, cmd->cmd_op, ub_cmd->q_id, tag,
- ub_cmd->result);
+ __func__, cmd->cmd_op, q_id, tag, result);
ret = ublk_check_cmd_op(cmd_op);
if (ret)
@@ -2296,25 +2391,24 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* so no need to validate the q_id, tag, or task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
- return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
- issue_flags);
+ return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
ret = -EINVAL;
- if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
+ if (q_id >= ub->dev_info.nr_hw_queues)
goto out;
- ubq = ublk_get_queue(ub, ub_cmd->q_id);
+ ubq = ublk_get_queue(ub, q_id);
- if (tag >= ubq->q_depth)
+ if (tag >= ub->dev_info.queue_depth)
goto out;
io = &ubq->ios[tag];
/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
- ret = ublk_check_fetch_buf(ubq, ub_cmd->addr);
+ ret = ublk_check_fetch_buf(ub, addr);
if (ret)
goto out;
- ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+ ret = ublk_fetch(cmd, ub, io, addr);
if (ret)
goto out;
@@ -2328,8 +2422,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* so can be handled on any task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
- return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
- issue_flags);
+ return ublk_register_io_buf(cmd, ub, q_id, tag, io,
+ addr, issue_flags);
goto out;
}
@@ -2350,24 +2444,24 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
- return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+ return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
issue_flags);
case UBLK_IO_COMMIT_AND_FETCH_REQ:
- ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr);
+ ret = ublk_check_commit_and_fetch(ub, io, addr);
if (ret)
goto out;
- io->res = ub_cmd->result;
+ io->res = result;
req = ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx);
- compl = ublk_need_complete_req(ubq, io);
+ ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
+ compl = ublk_need_complete_req(ub, io);
/* can't touch 'ublk_io' any more */
if (buf_idx != UBLK_INVALID_BUF_IDX)
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
+ req->__sector = addr;
if (compl)
- __ublk_complete_rq(req);
+ __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
if (ret)
goto out;
@@ -2379,7 +2473,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* request
*/
req = ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL);
+ ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
WARN_ON_ONCE(ret);
if (likely(ublk_get_data(ubq, io, req))) {
__ublk_prep_compl_io_cmd(io, req);
@@ -2399,16 +2493,15 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
}
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, struct ublk_io *io, size_t offset)
+ u16 q_id, u16 tag, struct ublk_io *io, size_t offset)
{
- unsigned tag = io - ubq->ios;
struct request *req;
/*
* can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
* which would overwrite it with io->cmd
*/
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
if (!req)
return NULL;
@@ -2430,33 +2523,13 @@ fail_put:
return NULL;
}
-static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- /*
- * Not necessary for async retry, but let's keep it simple and always
- * copy the values to avoid any potential reuse.
- */
- const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
- const struct ublksrv_io_cmd ub_cmd = {
- .q_id = READ_ONCE(ub_src->q_id),
- .tag = READ_ONCE(ub_src->tag),
- .result = READ_ONCE(ub_src->result),
- .addr = READ_ONCE(ub_src->addr)
- };
-
- WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
-
- return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
-}
-
static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
if (ret != -EIOCBQUEUED)
- io_uring_cmd_done(cmd, ret, 0, issue_flags);
+ io_uring_cmd_done(cmd, ret, issue_flags);
}
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
@@ -2519,17 +2592,14 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
return ERR_PTR(-EINVAL);
ubq = ublk_get_queue(ub, q_id);
- if (!ubq)
- return ERR_PTR(-EINVAL);
-
- if (!ublk_support_user_copy(ubq))
+ if (!ublk_dev_support_user_copy(ub))
return ERR_PTR(-EACCES);
- if (tag >= ubq->q_depth)
+ if (tag >= ub->dev_info.queue_depth)
return ERR_PTR(-EINVAL);
*io = &ubq->ios[tag];
- req = __ublk_check_and_get_req(ub, ubq, *io, buf_off);
+ req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off);
if (!req)
return ERR_PTR(-EINVAL);
@@ -2592,7 +2662,7 @@ static const struct file_operations ublk_ch_fops = {
static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
- int size = ublk_queue_cmd_buf_size(ub, q_id);
+ int size = ublk_queue_cmd_buf_size(ub);
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
int i;
@@ -2619,7 +2689,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
ubq->flags = ub->dev_info.flags;
ubq->q_id = q_id;
ubq->q_depth = ub->dev_info.queue_depth;
- size = ublk_queue_cmd_buf_size(ub, q_id);
+ size = ublk_queue_cmd_buf_size(ub);
ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
if (!ptr)
@@ -2880,8 +2950,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
ublk_apply_params(ub);
- /* don't probe partitions if any one ubq daemon is un-trusted */
- if (ub->nr_privileged_daemon != ub->nr_queues_ready)
+ /* don't probe partitions if any daemon task is un-trusted */
+ if (ub->unprivileged_daemons)
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
ublk_get_device(ub);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index e649fa67bac1..f061420dfb10 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -829,9 +829,9 @@ out:
}
/* We provide getgeo only to please some old bootloader/partitioning tools */
-static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
+static int virtblk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct virtio_blk *vblk = bd->bd_disk->private_data;
+ struct virtio_blk *vblk = disk->private_data;
int ret = 0;
mutex_lock(&vblk->vdev_mutex);
@@ -853,7 +853,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
/* some standard values, similar to sd */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
- geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ geo->cylinders = get_capacity(disk) >> 11;
}
out:
mutex_unlock(&vblk->vdev_mutex);
@@ -1682,7 +1682,7 @@ static int __init virtio_blk_init(void)
{
int error;
- virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
+ virtblk_wq = alloc_workqueue("virtio-blk", WQ_PERCPU, 0);
if (!virtblk_wq)
return -ENOMEM;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 5babe575c288..04fc6b552c04 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -493,11 +493,11 @@ static void blkif_restart_queue_callback(void *arg)
schedule_work(&rinfo->work);
}
-static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
+static int blkif_getgeo(struct gendisk *disk, struct hd_geometry *hg)
{
/* We don't have real geometry info, but let's at least return
values consistent with the size of the device */
- sector_t nsect = get_capacity(bd->bd_disk);
+ sector_t nsect = get_capacity(disk);
sector_t cylinders = nsect;
hg->heads = 0xff;
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index 553b1a713ab9..a423228e201b 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -700,6 +700,8 @@ static void zloop_free_disk(struct gendisk *disk)
struct zloop_device *zlo = disk->private_data;
unsigned int i;
+ blk_mq_free_tag_set(&zlo->tag_set);
+
for (i = 0; i < zlo->nr_zones; i++) {
struct zloop_zone *zone = &zlo->zones[i];
@@ -1080,7 +1082,6 @@ static int zloop_ctl_remove(struct zloop_options *opts)
del_gendisk(zlo->disk);
put_disk(zlo->disk);
- blk_mq_free_tag_set(&zlo->tag_set);
pr_info("Removed device %d\n", opts->id);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8acad3cc6e6e..a43074657531 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1085,7 +1085,7 @@ static int read_from_bdev_sync(struct zram *zram, struct page *page,
work.entry = entry;
INIT_WORK_ONSTACK(&work.work, zram_sync_read);
- queue_work(system_unbound_wq, &work.work);
+ queue_work(system_dfl_wq, &work.work);
flush_work(&work.work);
destroy_work_on_stack(&work.work);
@@ -1225,18 +1225,6 @@ static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
zram->comp_algs[prio] = alg;
}
-static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio,
- char *buf, ssize_t at)
-{
- ssize_t sz;
-
- down_read(&zram->init_lock);
- sz = zcomp_available_show(zram->comp_algs[prio], buf, at);
- up_read(&zram->init_lock);
-
- return sz;
-}
-
static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf)
{
char *compressor;
@@ -1387,8 +1375,12 @@ static ssize_t comp_algorithm_show(struct device *dev,
char *buf)
{
struct zram *zram = dev_to_zram(dev);
+ ssize_t sz;
- return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf, 0);
+ down_read(&zram->init_lock);
+ sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf, 0);
+ up_read(&zram->init_lock);
+ return sz;
}
static ssize_t comp_algorithm_store(struct device *dev,
@@ -1412,14 +1404,15 @@ static ssize_t recomp_algorithm_show(struct device *dev,
ssize_t sz = 0;
u32 prio;
+ down_read(&zram->init_lock);
for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
if (!zram->comp_algs[prio])
continue;
sz += sysfs_emit_at(buf, sz, "#%d: ", prio);
- sz += __comp_algorithm_show(zram, prio, buf, sz);
+ sz += zcomp_available_show(zram->comp_algs[prio], buf, sz);
}
-
+ up_read(&zram->init_lock);
return sz;
}
@@ -1795,6 +1788,7 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill,
u32 index)
{
zram_slot_lock(zram, index);
+ zram_free_page(zram, index);
zram_set_flag(zram, index, ZRAM_SAME);
zram_set_handle(zram, index, fill);
zram_slot_unlock(zram, index);
@@ -1832,6 +1826,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page,
kunmap_local(src);
zram_slot_lock(zram, index);
+ zram_free_page(zram, index);
zram_set_flag(zram, index, ZRAM_HUGE);
zram_set_handle(zram, index, handle);
zram_set_obj_size(zram, index, PAGE_SIZE);
@@ -1855,11 +1850,6 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
unsigned long element;
bool same_filled;
- /* First, free memory allocated to this slot (if any) */
- zram_slot_lock(zram, index);
- zram_free_page(zram, index);
- zram_slot_unlock(zram, index);
-
mem = kmap_local_page(page);
same_filled = page_same_filled(mem, &element);
kunmap_local(mem);
@@ -1901,6 +1891,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
zcomp_stream_put(zstrm);
zram_slot_lock(zram, index);
+ zram_free_page(zram, index);
zram_set_handle(zram, index, handle);
zram_set_obj_size(zram, index, comp_len);
zram_slot_unlock(zram, index);