diff options
Diffstat (limited to 'drivers/block')
43 files changed, 5237 insertions, 631 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 025b1b77b11a..ecceaaa1a66f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -7,7 +7,7 @@ menuconfig BLK_DEV bool "Block devices" depends on BLOCK default y - ---help--- + help Say Y here to get to see options for various different block device drivers. This option alone does not add any kernel code. @@ -27,7 +27,7 @@ config BLK_DEV_NULL_BLK_FAULT_INJECTION config BLK_DEV_FD tristate "Normal floppy disk support" depends on ARCH_MAY_HAVE_PC_FDC - ---help--- + help If you want to use the floppy disk drive(s) of your PC under Linux, say Y. Information about this driver, especially important for IBM Thinkpad users, is contained in @@ -91,7 +91,7 @@ config GDROM config PARIDE tristate "Parallel port IDE device support" depends on PARPORT_PC - ---help--- + help There are many external CD-ROM and disk devices that connect through your computer's parallel port. Most of them are actually IDE devices using a parallel port IDE adapter. This option enables the PARIDE @@ -124,7 +124,7 @@ source "drivers/block/zram/Kconfig" config BLK_DEV_UMEM tristate "Micro Memory MM5415 Battery Backed RAM support" depends on PCI - ---help--- + help Saying Y here will include support for the MM5415 family of battery backed (Non-volatile) RAM cards. <http://www.umem.com/> @@ -141,7 +141,7 @@ config BLK_DEV_UMEM config BLK_DEV_UBD bool "Virtual block device" depends on UML - ---help--- + help The User-Mode Linux port includes a driver called UBD which will let you access arbitrary files on the host computer as block devices. Unless you know that you do not need such virtual block devices say @@ -150,7 +150,7 @@ config BLK_DEV_UBD config BLK_DEV_UBD_SYNC bool "Always do synchronous disk IO for UBD" depends on BLK_DEV_UBD - ---help--- + help Writes to the virtual block device are not immediately written to the host's disk; this may cause problems if, for example, the User-Mode Linux 'Virtual Machine' uses a journalling filesystem and the host @@ -173,7 +173,7 @@ config BLK_DEV_COW_COMMON config BLK_DEV_LOOP tristate "Loopback device support" - ---help--- + help Saying Y here will allow you to use a regular file as a block device; you can then create a file system on that block device and mount it just as you would mount other block devices such as hard @@ -234,7 +234,7 @@ config BLK_DEV_CRYPTOLOOP select CRYPTO select CRYPTO_CBC depends on BLK_DEV_LOOP - ---help--- + help Say Y here if you want to be able to use the ciphers that are provided by the CryptoAPI as loop transformation. This might be used as hard disk encryption. @@ -249,7 +249,7 @@ source "drivers/block/drbd/Kconfig" config BLK_DEV_NBD tristate "Network block device support" depends on NET - ---help--- + help Saying Y here will allow your computer to be a client for network block devices, i.e. it will be able to use block devices exported by servers (mount file systems on them etc.). Communication between @@ -277,7 +277,7 @@ config BLK_DEV_SKD tristate "STEC S1120 Block Driver" depends on PCI depends on 64BIT - ---help--- + help Saying Y or M here will enable support for the STEC, Inc. S1120 PCIe SSD. @@ -286,7 +286,7 @@ config BLK_DEV_SKD config BLK_DEV_SX8 tristate "Promise SATA SX8 support" depends on PCI - ---help--- + help Saying Y or M here will enable support for the Promise SATA SX8 controllers. @@ -294,7 +294,7 @@ config BLK_DEV_SX8 config BLK_DEV_RAM tristate "RAM block device support" - ---help--- + help Saying Y here will allow you to use a portion of your RAM memory as a block device, so that you can make file systems on it, read and write to it and do all the other things that you can do with normal @@ -428,7 +428,7 @@ config XEN_BLKDEV_BACKEND config VIRTIO_BLK tristate "Virtio block driver" depends on VIRTIO - ---help--- + help This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. @@ -458,4 +458,6 @@ config BLK_DEV_RSXX To compile this driver as a module, choose M here: the module will be called rsxx. +source "drivers/block/rnbd/Kconfig" + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 795facd8cf19..e1f63117ee94 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_ZRAM) += zram/ +obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o null_blk-objs := null_blk_main.o diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index a27804d71e12..5ca7216e9e01 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -407,7 +407,6 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); - q->backing_dev_info->name = "aoe"; q->backing_dev_info->ra_pages = READ_AHEAD / PAGE_SIZE; d->bufpool = mp; d->blkq = gd->queue = q; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 15e99697234a..df53dca5d02c 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -396,9 +396,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) bytes = sizeof(struct page *)*want; new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { - new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_ZERO, - PAGE_KERNEL); + new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO); if (!new_pages) return NULL; } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index aae99a2d7bd4..33d0831c99b6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -620,7 +620,7 @@ struct fifo_buffer { unsigned int head_index; unsigned int size; int total; /* sum of all values */ - int values[0]; + int values[]; }; extern struct fifo_buffer *fifo_alloc(unsigned int fifo_size); @@ -1570,34 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_cork(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_uncork(struct socket *sock) -{ - int val = 0; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_nodelay(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_quickack(struct socket *sock) -{ - int val = 2; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char*)&val, sizeof(val)); -} - /* sets the number of 512 byte sectors of our virtual device */ void drbd_set_my_capacity(struct drbd_device *device, sector_t size); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c094c3c2c5d4..45fbd526c453 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -660,7 +660,7 @@ static int __send_command(struct drbd_connection *connection, int vnr, /* DRBD protocol "pings" are latency critical. * This is supposed to trigger tcp_push_pending_frames() */ if (!err && (cmd == P_PING || cmd == P_PING_ACK)) - drbd_tcp_nodelay(sock->socket); + tcp_sock_set_nodelay(sock->socket->sk); return err; } diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index e6fc5ad72501..dea59c92ecc1 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -271,7 +271,7 @@ struct p_rs_param { u32 resync_rate; /* Since protocol version 88 and higher. */ - char verify_alg[0]; + char verify_alg[]; } __packed; struct p_rs_param_89 { @@ -305,7 +305,7 @@ struct p_protocol { u32 two_primaries; /* Since protocol version 87 and higher. */ - char integrity_alg[0]; + char integrity_alg[]; } __packed; @@ -360,7 +360,7 @@ struct p_sizes { u16 dds_flags; /* use enum dds_flags here. */ /* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */ - struct o_qlim qlim[0]; + struct o_qlim qlim[]; } __packed; struct p_state { @@ -409,7 +409,7 @@ struct p_compressed_bm { */ u8 encoding; - u8 code[0]; + u8 code[]; } __packed; struct p_delay_probe93 { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c15e7083b13a..3a3f2b6a821f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1051,8 +1051,8 @@ randomize: /* we don't want delays. * we use TCP_CORK where appropriate, though */ - drbd_tcp_nodelay(sock.socket); - drbd_tcp_nodelay(msock.socket); + tcp_sock_set_nodelay(sock.socket->sk); + tcp_sock_set_nodelay(msock.socket->sk); connection->data.socket = sock.socket; connection->meta.socket = msock.socket; @@ -1223,7 +1223,7 @@ static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, str * quickly as possible, and let remote TCP know what we have * received so far. */ if (err == -EAGAIN) { - drbd_tcp_quickack(connection->data.socket); + tcp_sock_set_quickack(connection->data.socket->sk, 2); drbd_unplug_all_devices(connection); } if (err > 0) { @@ -4959,8 +4959,7 @@ static int receive_UnplugRemote(struct drbd_connection *connection, struct packe { /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(connection->data.socket); - + tcp_sock_set_quickack(connection->data.socket->sk, 2); return 0; } @@ -6162,7 +6161,7 @@ void drbd_send_acks_wf(struct work_struct *ws) rcu_read_unlock(); if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, true); err = drbd_finish_peer_reqs(device); kref_put(&device->kref, drbd_destroy_device); @@ -6175,7 +6174,7 @@ void drbd_send_acks_wf(struct work_struct *ws) } if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, false); return; } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 840c3aef3c5c..c80a2f1c3c2a 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -21,24 +21,6 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size); -/* Update disk stats at start of I/O request */ -static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) -{ - struct request_queue *q = device->rq_queue; - - generic_start_io_acct(q, bio_op(req->master_bio), - req->i.size >> 9, &device->vdisk->part0); -} - -/* Update disk stats when completing request upwards */ -static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) -{ - struct request_queue *q = device->rq_queue; - - generic_end_io_acct(q, bio_op(req->master_bio), - &device->vdisk->part0, req->start_jif); -} - static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src) { struct drbd_request *req; @@ -263,7 +245,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) start_new_tl_epoch(first_peer_device(device)->connection); /* Update disk stats */ - _drbd_end_io_acct(device, req); + bio_end_io_acct(req->master_bio, req->start_jif); /* If READ failed, * have it be pushed back to the retry work queue, @@ -1222,16 +1204,15 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long bio_endio(bio); return ERR_PTR(-ENOMEM); } - req->start_jif = start_jif; + + /* Update disk stats */ + req->start_jif = bio_start_io_acct(req->master_bio); if (!get_ldev(device)) { bio_put(req->private_bio); req->private_bio = NULL; } - /* Update disk stats */ - _drbd_start_io_acct(device, req); - /* process discards always from our submitter thread */ if (bio_op(bio) == REQ_OP_WRITE_ZEROES || bio_op(bio) == REQ_OP_DISCARD) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0dc019da1f8d..2b89c9f2ca70 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -2098,7 +2098,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * if (uncork) { mutex_lock(&connection->data.mutex); if (connection->data.socket) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); mutex_unlock(&connection->data.mutex); } @@ -2153,9 +2153,9 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * mutex_lock(&connection->data.mutex); if (connection->data.socket) { if (cork) - drbd_tcp_cork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, true); else if (!uncork) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); } mutex_unlock(&connection->data.mutex); } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c3daa64cb52c..3e9db22db2a8 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -337,8 +337,7 @@ static bool initialized; /* * globals used by 'result()' */ -#define MAX_REPLIES 16 -static unsigned char reply_buffer[MAX_REPLIES]; +static unsigned char reply_buffer[FD_RAW_REPLY_SIZE]; static int inr; /* size of reply buffer, when called from interrupt */ #define ST0 0 #define ST1 1 @@ -595,12 +594,12 @@ static unsigned char in_sector_offset; /* offset within physical sector, static inline unsigned char fdc_inb(int fdc, int reg) { - return fd_inb(fdc_state[fdc].address + reg); + return fd_inb(fdc_state[fdc].address, reg); } static inline void fdc_outb(unsigned char value, int fdc, int reg) { - fd_outb(value, fdc_state[fdc].address + reg); + fd_outb(value, fdc_state[fdc].address, reg); } static inline bool drive_no_geom(int drive) @@ -668,16 +667,12 @@ static struct output_log { static int output_log_pos; -#define current_reqD -1 #define MAXTIMEOUT -2 static void __reschedule_timeout(int drive, const char *message) { unsigned long delay; - if (drive == current_reqD) - drive = current_drive; - if (drive < 0 || drive >= N_DRIVE) { delay = 20UL * HZ; drive = 0; @@ -827,59 +822,70 @@ static int set_dor(int fdc, char mask, char data) return olddor; } -static void twaddle(void) +static void twaddle(int fdc, int drive) { - if (drive_params[current_drive].select_delay) + if (drive_params[drive].select_delay) return; - fdc_outb(fdc_state[current_fdc].dor & ~(0x10 << UNIT(current_drive)), - current_fdc, FD_DOR); - fdc_outb(fdc_state[current_fdc].dor, current_fdc, FD_DOR); - drive_state[current_drive].select_date = jiffies; + fdc_outb(fdc_state[fdc].dor & ~(0x10 << UNIT(drive)), + fdc, FD_DOR); + fdc_outb(fdc_state[fdc].dor, fdc, FD_DOR); + drive_state[drive].select_date = jiffies; } /* - * Reset all driver information about the current fdc. + * Reset all driver information about the specified fdc. * This is needed after a reset, and after a raw command. */ -static void reset_fdc_info(int mode) +static void reset_fdc_info(int fdc, int mode) { int drive; - fdc_state[current_fdc].spec1 = fdc_state[current_fdc].spec2 = -1; - fdc_state[current_fdc].need_configure = 1; - fdc_state[current_fdc].perp_mode = 1; - fdc_state[current_fdc].rawcmd = 0; + fdc_state[fdc].spec1 = fdc_state[fdc].spec2 = -1; + fdc_state[fdc].need_configure = 1; + fdc_state[fdc].perp_mode = 1; + fdc_state[fdc].rawcmd = 0; for (drive = 0; drive < N_DRIVE; drive++) - if (FDC(drive) == current_fdc && + if (FDC(drive) == fdc && (mode || drive_state[drive].track != NEED_1_RECAL)) drive_state[drive].track = NEED_2_RECAL; } -/* selects the fdc and drive, and enables the fdc's input/dma. */ +/* + * selects the fdc and drive, and enables the fdc's input/dma. + * Both current_drive and current_fdc are changed to match the new drive. + */ static void set_fdc(int drive) { - unsigned int new_fdc = current_fdc; + unsigned int fdc; - if (drive >= 0 && drive < N_DRIVE) { - new_fdc = FDC(drive); - current_drive = drive; + if (drive < 0 || drive >= N_DRIVE) { + pr_info("bad drive value %d\n", drive); + return; } - if (new_fdc >= N_FDC) { + + fdc = FDC(drive); + if (fdc >= N_FDC) { pr_info("bad fdc value\n"); return; } - current_fdc = new_fdc; - set_dor(current_fdc, ~0, 8); + + set_dor(fdc, ~0, 8); #if N_FDC > 1 - set_dor(1 - current_fdc, ~8, 0); + set_dor(1 - fdc, ~8, 0); #endif - if (fdc_state[current_fdc].rawcmd == 2) - reset_fdc_info(1); - if (fdc_inb(current_fdc, FD_STATUS) != STATUS_READY) - fdc_state[current_fdc].reset = 1; + if (fdc_state[fdc].rawcmd == 2) + reset_fdc_info(fdc, 1); + if (fdc_inb(fdc, FD_STATUS) != STATUS_READY) + fdc_state[fdc].reset = 1; + + current_drive = drive; + current_fdc = fdc; } -/* locks the driver */ +/* + * locks the driver. + * Both current_drive and current_fdc are changed to match the new drive. + */ static int lock_fdc(int drive) { if (WARN(atomic_read(&usage_count) == 0, @@ -1062,12 +1068,9 @@ static void setup_DMA(void) unsigned long f; if (raw_cmd->length == 0) { - int i; - - pr_info("zero dma transfer size:"); - for (i = 0; i < raw_cmd->cmd_count; i++) - pr_cont("%x,", raw_cmd->cmd[i]); - pr_cont("\n"); + print_hex_dump(KERN_INFO, "zero dma transfer size: ", + DUMP_PREFIX_NONE, 16, 1, + raw_cmd->fullcmd, raw_cmd->cmd_count, false); cont->done(0); fdc_state[current_fdc].reset = 1; return; @@ -1104,62 +1107,62 @@ static void setup_DMA(void) #endif } -static void show_floppy(void); +static void show_floppy(int fdc); /* waits until the fdc becomes ready */ -static int wait_til_ready(void) +static int wait_til_ready(int fdc) { int status; int counter; - if (fdc_state[current_fdc].reset) + if (fdc_state[fdc].reset) return -1; for (counter = 0; counter < 10000; counter++) { - status = fdc_inb(current_fdc, FD_STATUS); + status = fdc_inb(fdc, FD_STATUS); if (status & STATUS_READY) return status; } if (initialized) { - DPRINT("Getstatus times out (%x) on fdc %d\n", status, current_fdc); - show_floppy(); + DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc); + show_floppy(fdc); } - fdc_state[current_fdc].reset = 1; + fdc_state[fdc].reset = 1; return -1; } /* sends a command byte to the fdc */ -static int output_byte(char byte) +static int output_byte(int fdc, char byte) { - int status = wait_til_ready(); + int status = wait_til_ready(fdc); if (status < 0) return -1; if (is_ready_state(status)) { - fdc_outb(byte, current_fdc, FD_DATA); + fdc_outb(byte, fdc, FD_DATA); output_log[output_log_pos].data = byte; output_log[output_log_pos].status = status; output_log[output_log_pos].jiffies = jiffies; output_log_pos = (output_log_pos + 1) % OLOGSIZE; return 0; } - fdc_state[current_fdc].reset = 1; + fdc_state[fdc].reset = 1; if (initialized) { DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n", - byte, current_fdc, status); - show_floppy(); + byte, fdc, status); + show_floppy(fdc); } return -1; } /* gets the response from the fdc */ -static int result(void) +static int result(int fdc) { int i; int status = 0; - for (i = 0; i < MAX_REPLIES; i++) { - status = wait_til_ready(); + for (i = 0; i < FD_RAW_REPLY_SIZE; i++) { + status = wait_til_ready(fdc); if (status < 0) break; status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA; @@ -1169,24 +1172,24 @@ static int result(void) return i; } if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY)) - reply_buffer[i] = fdc_inb(current_fdc, FD_DATA); + reply_buffer[i] = fdc_inb(fdc, FD_DATA); else break; } if (initialized) { DPRINT("get result error. Fdc=%d Last status=%x Read bytes=%d\n", - current_fdc, status, i); - show_floppy(); + fdc, status, i); + show_floppy(fdc); } - fdc_state[current_fdc].reset = 1; + fdc_state[fdc].reset = 1; return -1; } #define MORE_OUTPUT -2 /* does the fdc need more output? */ -static int need_more_output(void) +static int need_more_output(int fdc) { - int status = wait_til_ready(); + int status = wait_til_ready(fdc); if (status < 0) return -1; @@ -1194,13 +1197,13 @@ static int need_more_output(void) if (is_ready_state(status)) return MORE_OUTPUT; - return result(); + return result(fdc); } /* Set perpendicular mode as required, based on data rate, if supported. * 82077 Now tested. 1Mbps data rate only possible with 82077-1. */ -static void perpendicular_mode(void) +static void perpendicular_mode(int fdc) { unsigned char perp_mode; @@ -1215,7 +1218,7 @@ static void perpendicular_mode(void) default: DPRINT("Invalid data rate for perpendicular mode!\n"); cont->done(0); - fdc_state[current_fdc].reset = 1; + fdc_state[fdc].reset = 1; /* * convenient way to return to * redo without too much hassle @@ -1226,12 +1229,12 @@ static void perpendicular_mode(void) } else perp_mode = 0; - if (fdc_state[current_fdc].perp_mode == perp_mode) + if (fdc_state[fdc].perp_mode == perp_mode) return; - if (fdc_state[current_fdc].version >= FDC_82077_ORIG) { - output_byte(FD_PERPENDICULAR); - output_byte(perp_mode); - fdc_state[current_fdc].perp_mode = perp_mode; + if (fdc_state[fdc].version >= FDC_82077_ORIG) { + output_byte(fdc, FD_PERPENDICULAR); + output_byte(fdc, perp_mode); + fdc_state[fdc].perp_mode = perp_mode; } else if (perp_mode) { DPRINT("perpendicular mode not supported by this FDC.\n"); } @@ -1240,16 +1243,15 @@ static void perpendicular_mode(void) static int fifo_depth = 0xa; static int no_fifo; -static int fdc_configure(void) +static int fdc_configure(int fdc) { /* Turn on FIFO */ - output_byte(FD_CONFIGURE); - if (need_more_output() != MORE_OUTPUT) + output_byte(fdc, FD_CONFIGURE); + if (need_more_output(fdc) != MORE_OUTPUT) return 0; - output_byte(0); - output_byte(0x10 | (no_fifo & 0x20) | (fifo_depth & 0xf)); - output_byte(0); /* pre-compensation from track - 0 upwards */ + output_byte(fdc, 0); + output_byte(fdc, 0x10 | (no_fifo & 0x20) | (fifo_depth & 0xf)); + output_byte(fdc, 0); /* pre-compensation from track 0 upwards */ return 1; } @@ -1274,7 +1276,7 @@ static int fdc_configure(void) * * These values are rounded up to the next highest available delay time. */ -static void fdc_specify(void) +static void fdc_specify(int fdc, int drive) { unsigned char spec1; unsigned char spec2; @@ -1286,10 +1288,10 @@ static void fdc_specify(void) int hlt_max_code = 0x7f; int hut_max_code = 0xf; - if (fdc_state[current_fdc].need_configure && - fdc_state[current_fdc].version >= FDC_82072A) { - fdc_configure(); - fdc_state[current_fdc].need_configure = 0; + if (fdc_state[fdc].need_configure && + fdc_state[fdc].version >= FDC_82072A) { + fdc_configure(fdc); + fdc_state[fdc].need_configure = 0; } switch (raw_cmd->rate & 0x03) { @@ -1298,13 +1300,13 @@ static void fdc_specify(void) break; case 1: dtr = 300; - if (fdc_state[current_fdc].version >= FDC_82078) { + if (fdc_state[fdc].version >= FDC_82078) { /* chose the default rate table, not the one * where 1 = 2 Mbps */ - output_byte(FD_DRIVESPEC); - if (need_more_output() == MORE_OUTPUT) { - output_byte(UNIT(current_drive)); - output_byte(0xc0); + output_byte(fdc, FD_DRIVESPEC); + if (need_more_output(fdc) == MORE_OUTPUT) { + output_byte(fdc, UNIT(drive)); + output_byte(fdc, 0xc0); } } break; @@ -1313,14 +1315,14 @@ static void fdc_specify(void) break; } - if (fdc_state[current_fdc].version >= FDC_82072) { + if (fdc_state[fdc].version >= FDC_82072) { scale_dtr = dtr; hlt_max_code = 0x00; /* 0==256msec*dtr0/dtr (not linear!) */ hut_max_code = 0x0; /* 0==256msec*dtr0/dtr (not linear!) */ } /* Convert step rate from microseconds to milliseconds and 4 bits */ - srt = 16 - DIV_ROUND_UP(drive_params[current_drive].srt * scale_dtr / 1000, + srt = 16 - DIV_ROUND_UP(drive_params[drive].srt * scale_dtr / 1000, NOMINAL_DTR); if (slow_floppy) srt = srt / 4; @@ -1328,14 +1330,14 @@ static void fdc_specify(void) SUPBOUND(srt, 0xf); INFBOUND(srt, 0); - hlt = DIV_ROUND_UP(drive_params[current_drive].hlt * scale_dtr / 2, + hlt = DIV_ROUND_UP(drive_params[drive].hlt * scale_dtr / 2, NOMINAL_DTR); if (hlt < 0x01) hlt = 0x01; else if (hlt > 0x7f) hlt = hlt_max_code; - hut = DIV_ROUND_UP(drive_params[current_drive].hut * scale_dtr / 16, + hut = DIV_ROUND_UP(drive_params[drive].hut * scale_dtr / 16, NOMINAL_DTR); if (hut < 0x1) hut = 0x1; @@ -1346,12 +1348,12 @@ static void fdc_specify(void) spec2 = (hlt << 1) | (use_virtual_dma & 1); /* If these parameters did not change, just return with success */ - if (fdc_state[current_fdc].spec1 != spec1 || - fdc_state[current_fdc].spec2 != spec2) { + if (fdc_state[fdc].spec1 != spec1 || + fdc_state[fdc].spec2 != spec2) { /* Go ahead and set spec1 and spec2 */ - output_byte(FD_SPECIFY); - output_byte(fdc_state[current_fdc].spec1 = spec1); - output_byte(fdc_state[current_fdc].spec2 = spec2); + output_byte(fdc, FD_SPECIFY); + output_byte(fdc, fdc_state[fdc].spec1 = spec1); + output_byte(fdc, fdc_state[fdc].spec2 = spec2); } } /* fdc_specify */ @@ -1513,7 +1515,7 @@ static void setup_rw_floppy(void) r = 0; for (i = 0; i < raw_cmd->cmd_count; i++) - r |= output_byte(raw_cmd->cmd[i]); + r |= output_byte(current_fdc, raw_cmd->fullcmd[i]); debugt(__func__, "rw_command"); @@ -1524,7 +1526,7 @@ static void setup_rw_floppy(void) } if (!(flags & FD_RAW_INTR)) { - inr = result(); + inr = result(current_fdc); cont->interrupt(); } else if (flags & FD_RAW_NEED_DISK) fd_watchdog(); @@ -1562,29 +1564,29 @@ static void seek_interrupt(void) floppy_ready(); } -static void check_wp(void) +static void check_wp(int fdc, int drive) { - if (test_bit(FD_VERIFY_BIT, &drive_state[current_drive].flags)) { + if (test_bit(FD_VERIFY_BIT, &drive_state[drive].flags)) { /* check write protection */ - output_byte(FD_GETSTATUS); - output_byte(UNIT(current_drive)); - if (result() != 1) { - fdc_state[current_fdc].reset = 1; + output_byte(fdc, FD_GETSTATUS); + output_byte(fdc, UNIT(drive)); + if (result(fdc) != 1) { + fdc_state[fdc].reset = 1; return; } - clear_bit(FD_VERIFY_BIT, &drive_state[current_drive].flags); + clear_bit(FD_VERIFY_BIT, &drive_state[drive].flags); clear_bit(FD_NEED_TWADDLE_BIT, - &drive_state[current_drive].flags); - debug_dcl(drive_params[current_drive].flags, + &drive_state[drive].flags); + debug_dcl(drive_params[drive].flags, "checking whether disk is write protected\n"); - debug_dcl(drive_params[current_drive].flags, "wp=%x\n", + debug_dcl(drive_params[drive].flags, "wp=%x\n", reply_buffer[ST3] & 0x40); if (!(reply_buffer[ST3] & 0x40)) set_bit(FD_DISK_WRITABLE_BIT, - &drive_state[current_drive].flags); + &drive_state[drive].flags); else clear_bit(FD_DISK_WRITABLE_BIT, - &drive_state[current_drive].flags); + &drive_state[drive].flags); } } @@ -1628,7 +1630,7 @@ static void seek_floppy(void) track = 1; } } else { - check_wp(); + check_wp(current_fdc, current_drive); if (raw_cmd->track != drive_state[current_drive].track && (raw_cmd->flags & FD_RAW_NEED_SEEK)) track = raw_cmd->track; @@ -1639,9 +1641,9 @@ static void seek_floppy(void) } do_floppy = seek_interrupt; - output_byte(FD_SEEK); - output_byte(UNIT(current_drive)); - if (output_byte(track) < 0) { + output_byte(current_fdc, FD_SEEK); + output_byte(current_fdc, UNIT(current_drive)); + if (output_byte(current_fdc, track) < 0) { reset_fdc(); return; } @@ -1742,14 +1744,14 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) do_print = !handler && print_unex && initialized; - inr = result(); + inr = result(current_fdc); if (do_print) print_result("unexpected interrupt", inr); if (inr == 0) { int max_sensei = 4; do { - output_byte(FD_SENSEI); - inr = result(); + output_byte(current_fdc, FD_SENSEI); + inr = result(current_fdc); if (do_print) print_result("sensei", inr); max_sensei--; @@ -1771,8 +1773,8 @@ static void recalibrate_floppy(void) { debugt(__func__, ""); do_floppy = recal_interrupt; - output_byte(FD_RECALIBRATE); - if (output_byte(UNIT(current_drive)) < 0) + output_byte(current_fdc, FD_RECALIBRATE); + if (output_byte(current_fdc, UNIT(current_drive)) < 0) reset_fdc(); } @@ -1782,7 +1784,7 @@ static void recalibrate_floppy(void) static void reset_interrupt(void) { debugt(__func__, ""); - result(); /* get the status ready for set_fdc */ + result(current_fdc); /* get the status ready for set_fdc */ if (fdc_state[current_fdc].reset) { pr_info("reset set in interrupt, calling %ps\n", cont->error); cont->error(); /* a reset just after a reset. BAD! */ @@ -1792,7 +1794,9 @@ static void reset_interrupt(void) /* * reset is done by pulling bit 2 of DOR low for a while (old FDCs), - * or by setting the self clearing bit 7 of STATUS (newer FDCs) + * or by setting the self clearing bit 7 of STATUS (newer FDCs). + * This WILL trigger an interrupt, causing the handlers in the current + * cont's ->redo() to be called via reset_interrupt(). */ static void reset_fdc(void) { @@ -1800,7 +1804,7 @@ static void reset_fdc(void) do_floppy = reset_interrupt; fdc_state[current_fdc].reset = 0; - reset_fdc_info(0); + reset_fdc_info(current_fdc, 0); /* Pseudo-DMA may intercept 'reset finished' interrupt. */ /* Irrelevant for systems with true DMA (i386). */ @@ -1819,7 +1823,7 @@ static void reset_fdc(void) } } -static void show_floppy(void) +static void show_floppy(int fdc) { int i; @@ -1842,7 +1846,7 @@ static void show_floppy(void) print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, reply_buffer, resultsize, true); - pr_info("status=%x\n", fdc_inb(current_fdc, FD_STATUS)); + pr_info("status=%x\n", fdc_inb(fdc, FD_STATUS)); pr_info("fdc_busy=%lu\n", fdc_busy); if (do_floppy) pr_info("do_floppy=%ps\n", do_floppy); @@ -1868,7 +1872,7 @@ static void floppy_shutdown(struct work_struct *arg) unsigned long flags; if (initialized) - show_floppy(); + show_floppy(current_fdc); cancel_activity(); flags = claim_dma_lock(); @@ -1934,7 +1938,7 @@ static void floppy_ready(void) "calling disk change from floppy_ready\n"); if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) && disk_change(current_drive) && !drive_params[current_drive].select_delay) - twaddle(); /* this clears the dcl on certain + twaddle(current_fdc, current_drive); /* this clears the dcl on certain * drive/controller combinations */ #ifdef fd_chose_dma_mode @@ -1946,20 +1950,20 @@ static void floppy_ready(void) #endif if (raw_cmd->flags & (FD_RAW_NEED_SEEK | FD_RAW_NEED_DISK)) { - perpendicular_mode(); - fdc_specify(); /* must be done here because of hut, hlt ... */ + perpendicular_mode(current_fdc); + fdc_specify(current_fdc, current_drive); /* must be done here because of hut, hlt ... */ seek_floppy(); } else { if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) - fdc_specify(); + fdc_specify(current_fdc, current_drive); setup_rw_floppy(); } } static void floppy_start(void) { - reschedule_timeout(current_reqD, "floppy start"); + reschedule_timeout(current_drive, "floppy start"); scandrives(); debug_dcl(drive_params[current_drive].flags, @@ -2004,6 +2008,9 @@ static const struct cont_t intr_cont = { .done = (done_f)empty }; +/* schedules handler, waiting for completion. May be interrupted, will then + * return -EINTR, in which case the driver will automatically be unlocked. + */ static int wait_til_done(void (*handler)(void), bool interruptible) { int ret; @@ -2059,18 +2066,19 @@ static void success_and_wakeup(void) * ========================== */ -static int next_valid_format(void) +static int next_valid_format(int drive) { int probed_format; - probed_format = drive_state[current_drive].probed_format; + probed_format = drive_state[drive].probed_format; while (1) { - if (probed_format >= 8 || !drive_params[current_drive].autodetect[probed_format]) { - drive_state[current_drive].probed_format = 0; + if (probed_format >= FD_AUTODETECT_SIZE || + !drive_params[drive].autodetect[probed_format]) { + drive_state[drive].probed_format = 0; return 1; } - if (floppy_type[drive_params[current_drive].autodetect[probed_format]].sect) { - drive_state[current_drive].probed_format = probed_format; + if (floppy_type[drive_params[drive].autodetect[probed_format]].sect) { + drive_state[drive].probed_format = probed_format; return 0; } probed_format++; @@ -2083,7 +2091,7 @@ static void bad_flp_intr(void) if (probing) { drive_state[current_drive].probed_format++; - if (!next_valid_format()) + if (!next_valid_format(current_drive)) return; } err_count = ++(*errors); @@ -2843,6 +2851,9 @@ static int set_next_request(void) return current_req != NULL; } +/* Starts or continues processing request. Will automatically unlock the + * driver at end of request. + */ static void redo_fd_request(void) { int drive; @@ -2867,7 +2878,7 @@ do_request: } drive = (long)current_req->rq_disk->private_data; set_fdc(drive); - reschedule_timeout(current_reqD, "redo fd request"); + reschedule_timeout(current_drive, "redo fd request"); set_floppy(drive); raw_cmd = &default_raw_cmd; @@ -2885,7 +2896,7 @@ do_request: if (!_floppy) { /* Autodetection */ if (!probing) { drive_state[current_drive].probed_format = 0; - if (next_valid_format()) { + if (next_valid_format(current_drive)) { DPRINT("no autodetectable formats\n"); _floppy = NULL; request_done(0); @@ -2904,7 +2915,7 @@ do_request: } if (test_bit(FD_NEED_TWADDLE_BIT, &drive_state[current_drive].flags)) - twaddle(); + twaddle(current_fdc, current_drive); schedule_bh(floppy_start); debugt(__func__, "queue fd request"); return; @@ -2917,6 +2928,7 @@ static const struct cont_t rw_cont = { .done = request_done }; +/* schedule the request and automatically unlock the driver on completion */ static void process_fd_request(void) { cont = &rw_cont; @@ -2938,17 +2950,17 @@ static blk_status_t floppy_queue_rq(struct blk_mq_hw_ctx *hctx, (unsigned long long) current_req->cmd_flags)) return BLK_STS_IOERR; - spin_lock_irq(&floppy_lock); - list_add_tail(&bd->rq->queuelist, &floppy_reqs); - spin_unlock_irq(&floppy_lock); - if (test_and_set_bit(0, &fdc_busy)) { /* fdc busy, this new request will be treated when the current one is done */ is_alive(__func__, "old request running"); - return BLK_STS_OK; + return BLK_STS_RESOURCE; } + spin_lock_irq(&floppy_lock); + list_add_tail(&bd->rq->queuelist, &floppy_reqs); + spin_unlock_irq(&floppy_lock); + command_status = FD_COMMAND_NONE; __reschedule_timeout(MAXTIMEOUT, "fd_request"); set_fdc(0); @@ -2996,6 +3008,10 @@ static const struct cont_t reset_cont = { .done = generic_done }; +/* + * Resets the FDC connected to drive <drive>. + * Both current_drive and current_fdc are changed to match the new drive. + */ static int user_reset_fdc(int drive, int arg, bool interruptible) { int ret; @@ -3006,6 +3022,9 @@ static int user_reset_fdc(int drive, int arg, bool interruptible) if (arg == FD_RESET_ALWAYS) fdc_state[current_fdc].reset = 1; if (fdc_state[current_fdc].reset) { + /* note: reset_fdc will take care of unlocking the driver + * on completion. + */ cont = &reset_cont; ret = wait_til_done(reset_fdc, interruptible); if (ret == -EINTR) @@ -3059,7 +3078,7 @@ static void raw_cmd_done(int flag) raw_cmd->flags |= FD_RAW_HARDFAILURE; } else { raw_cmd->reply_count = inr; - if (raw_cmd->reply_count > MAX_REPLIES) + if (raw_cmd->reply_count > FD_RAW_REPLY_SIZE) raw_cmd->reply_count = 0; for (i = 0; i < raw_cmd->reply_count; i++) raw_cmd->reply[i] = reply_buffer[i]; @@ -3170,18 +3189,10 @@ loop: if (ret) return -EFAULT; param += sizeof(struct floppy_raw_cmd); - if (ptr->cmd_count > 33) - /* the command may now also take up the space - * initially intended for the reply & the - * reply count. Needed for long 82078 commands - * such as RESTORE, which takes ... 17 command - * bytes. Murphy's law #137: When you reserve - * 16 bytes for a structure, you'll one day - * discover that you really need 17... - */ + if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE) return -EINVAL; - for (i = 0; i < 16; i++) + for (i = 0; i < FD_RAW_REPLY_SIZE; i++) ptr->reply[i] = 0; ptr->resultcode = 0; @@ -3423,13 +3434,13 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static bool valid_floppy_drive_params(const short autodetect[8], +static bool valid_floppy_drive_params(const short autodetect[FD_AUTODETECT_SIZE], int native_format) { size_t floppy_type_size = ARRAY_SIZE(floppy_type); size_t i = 0; - for (i = 0; i < 8; ++i) { + for (i = 0; i < FD_AUTODETECT_SIZE; ++i) { if (autodetect[i] < 0 || autodetect[i] >= floppy_type_size) return false; @@ -3610,7 +3621,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int case FDTWADDLE: if (lock_fdc(drive)) return -EINTR; - twaddle(); + twaddle(current_fdc, current_drive); process_fd_request(); return 0; default: @@ -3654,7 +3665,7 @@ struct compat_floppy_drive_params { struct floppy_max_errors max_errors; char flags; char read_track; - short autodetect[8]; + short autodetect[FD_AUTODETECT_SIZE]; compat_int_t checkfreq; compat_int_t native_format; }; @@ -4298,79 +4309,79 @@ static const struct block_device_operations floppy_fops = { /* Determine the floppy disk controller type */ /* This routine was written by David C. Niemi */ -static char __init get_fdc_version(void) +static char __init get_fdc_version(int fdc) { int r; - output_byte(FD_DUMPREGS); /* 82072 and better know DUMPREGS */ - if (fdc_state[current_fdc].reset) + output_byte(fdc, FD_DUMPREGS); /* 82072 and better know DUMPREGS */ + if (fdc_state[fdc].reset) return FDC_NONE; - r = result(); + r = result(fdc); if (r <= 0x00) return FDC_NONE; /* No FDC present ??? */ if ((r == 1) && (reply_buffer[0] == 0x80)) { - pr_info("FDC %d is an 8272A\n", current_fdc); + pr_info("FDC %d is an 8272A\n", fdc); return FDC_8272A; /* 8272a/765 don't know DUMPREGS */ } if (r != 10) { pr_info("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n", - current_fdc, r); + fdc, r); return FDC_UNKNOWN; } - if (!fdc_configure()) { - pr_info("FDC %d is an 82072\n", current_fdc); + if (!fdc_configure(fdc)) { + pr_info("FDC %d is an 82072\n", fdc); return FDC_82072; /* 82072 doesn't know CONFIGURE */ } - output_byte(FD_PERPENDICULAR); - if (need_more_output() == MORE_OUTPUT) { - output_byte(0); + output_byte(fdc, FD_PERPENDICULAR); + if (need_more_output(fdc) == MORE_OUTPUT) { + output_byte(fdc, 0); } else { - pr_info("FDC %d is an 82072A\n", current_fdc); + pr_info("FDC %d is an 82072A\n", fdc); return FDC_82072A; /* 82072A as found on Sparcs. */ } - output_byte(FD_UNLOCK); - r = result(); + output_byte(fdc, FD_UNLOCK); + r = result(fdc); if ((r == 1) && (reply_buffer[0] == 0x80)) { - pr_info("FDC %d is a pre-1991 82077\n", current_fdc); + pr_info("FDC %d is a pre-1991 82077\n", fdc); return FDC_82077_ORIG; /* Pre-1991 82077, doesn't know * LOCK/UNLOCK */ } if ((r != 1) || (reply_buffer[0] != 0x00)) { pr_info("FDC %d init: UNLOCK: unexpected return of %d bytes.\n", - current_fdc, r); + fdc, r); return FDC_UNKNOWN; } - output_byte(FD_PARTID); - r = result(); + output_byte(fdc, FD_PARTID); + r = result(fdc); if (r != 1) { pr_info("FDC %d init: PARTID: unexpected return of %d bytes.\n", - current_fdc, r); + fdc, r); return FDC_UNKNOWN; } if (reply_buffer[0] == 0x80) { - pr_info("FDC %d is a post-1991 82077\n", current_fdc); + pr_info("FDC %d is a post-1991 82077\n", fdc); return FDC_82077; /* Revised 82077AA passes all the tests */ } switch (reply_buffer[0] >> 5) { case 0x0: /* Either a 82078-1 or a 82078SL running at 5Volt */ - pr_info("FDC %d is an 82078.\n", current_fdc); + pr_info("FDC %d is an 82078.\n", fdc); return FDC_82078; case 0x1: - pr_info("FDC %d is a 44pin 82078\n", current_fdc); + pr_info("FDC %d is a 44pin 82078\n", fdc); return FDC_82078; case 0x2: - pr_info("FDC %d is a S82078B\n", current_fdc); + pr_info("FDC %d is a S82078B\n", fdc); return FDC_S82078B; case 0x3: - pr_info("FDC %d is a National Semiconductor PC87306\n", current_fdc); + pr_info("FDC %d is a National Semiconductor PC87306\n", fdc); return FDC_87306; default: pr_info("FDC %d init: 82078 variant with unknown PARTID=%d.\n", - current_fdc, reply_buffer[0] >> 5); + fdc, reply_buffer[0] >> 5); return FDC_82078_UNKN; } } /* get_fdc_version */ @@ -4534,11 +4545,13 @@ static void floppy_device_release(struct device *dev) static int floppy_resume(struct device *dev) { int fdc; + int saved_drive; + saved_drive = current_drive; for (fdc = 0; fdc < N_FDC; fdc++) if (fdc_state[fdc].address != -1) - user_reset_fdc(-1, FD_RESET_ALWAYS, false); - + user_reset_fdc(REVDRIVE(fdc, 0), FD_RESET_ALWAYS, false); + set_fdc(saved_drive); return 0; } @@ -4646,16 +4659,15 @@ static int __init do_floppy_init(void) config_types(); for (i = 0; i < N_FDC; i++) { - current_fdc = i; - memset(&fdc_state[current_fdc], 0, sizeof(*fdc_state)); - fdc_state[current_fdc].dtr = -1; - fdc_state[current_fdc].dor = 0x4; + memset(&fdc_state[i], 0, sizeof(*fdc_state)); + fdc_state[i].dtr = -1; + fdc_state[i].dor = 0x4; #if defined(__sparc__) || defined(__mc68000__) /*sparcs/sun3x don't have a DOR reset which we can fall back on to */ #ifdef __mc68000__ if (MACH_IS_SUN3X) #endif - fdc_state[current_fdc].version = FDC_82072A; + fdc_state[i].version = FDC_82072A; #endif } @@ -4697,30 +4709,29 @@ static int __init do_floppy_init(void) msleep(10); for (i = 0; i < N_FDC; i++) { - current_fdc = i; - fdc_state[current_fdc].driver_version = FD_DRIVER_VERSION; + fdc_state[i].driver_version = FD_DRIVER_VERSION; for (unit = 0; unit < 4; unit++) - fdc_state[current_fdc].track[unit] = 0; - if (fdc_state[current_fdc].address == -1) + fdc_state[i].track[unit] = 0; + if (fdc_state[i].address == -1) continue; - fdc_state[current_fdc].rawcmd = 2; - if (user_reset_fdc(-1, FD_RESET_ALWAYS, false)) { + fdc_state[i].rawcmd = 2; + if (user_reset_fdc(REVDRIVE(i, 0), FD_RESET_ALWAYS, false)) { /* free ioports reserved by floppy_grab_irq_and_dma() */ - floppy_release_regions(current_fdc); - fdc_state[current_fdc].address = -1; - fdc_state[current_fdc].version = FDC_NONE; + floppy_release_regions(i); + fdc_state[i].address = -1; + fdc_state[i].version = FDC_NONE; continue; } /* Try to determine the floppy controller type */ - fdc_state[current_fdc].version = get_fdc_version(); - if (fdc_state[current_fdc].version == FDC_NONE) { + fdc_state[i].version = get_fdc_version(i); + if (fdc_state[i].version == FDC_NONE) { /* free ioports reserved by floppy_grab_irq_and_dma() */ - floppy_release_regions(current_fdc); - fdc_state[current_fdc].address = -1; + floppy_release_regions(i); + fdc_state[i].address = -1; continue; } if (can_use_virtual_dma == 2 && - fdc_state[current_fdc].version < FDC_82072A) + fdc_state[i].version < FDC_82072A) can_use_virtual_dma = 0; have_no_fdc = 0; @@ -4728,7 +4739,7 @@ static int __init do_floppy_init(void) * properly, so force a reset for the standard FDC clones, * to avoid interrupt garbage. */ - user_reset_fdc(-1, FD_RESET_ALWAYS, false); + user_reset_fdc(REVDRIVE(i, 0), FD_RESET_ALWAYS, false); } current_fdc = 0; cancel_delayed_work(&fd_timeout); @@ -4855,6 +4866,8 @@ static void floppy_release_regions(int fdc) static int floppy_grab_irq_and_dma(void) { + int fdc; + if (atomic_inc_return(&usage_count) > 1) return 0; @@ -4882,24 +4895,24 @@ static int floppy_grab_irq_and_dma(void) } } - for (current_fdc = 0; current_fdc < N_FDC; current_fdc++) { - if (fdc_state[current_fdc].address != -1) { - if (floppy_request_regions(current_fdc)) + for (fdc = 0; fdc < N_FDC; fdc++) { + if (fdc_state[fdc].address != -1) { + if (floppy_request_regions(fdc)) goto cleanup; } } - for (current_fdc = 0; current_fdc < N_FDC; current_fdc++) { - if (fdc_state[current_fdc].address != -1) { - reset_fdc_info(1); - fdc_outb(fdc_state[current_fdc].dor, current_fdc, FD_DOR); + for (fdc = 0; fdc < N_FDC; fdc++) { + if (fdc_state[fdc].address != -1) { + reset_fdc_info(fdc, 1); + fdc_outb(fdc_state[fdc].dor, fdc, FD_DOR); } } - current_fdc = 0; + set_dor(0, ~0, 8); /* avoid immediate interrupt */ - for (current_fdc = 0; current_fdc < N_FDC; current_fdc++) - if (fdc_state[current_fdc].address != -1) - fdc_outb(fdc_state[current_fdc].dor, current_fdc, FD_DOR); + for (fdc = 0; fdc < N_FDC; fdc++) + if (fdc_state[fdc].address != -1) + fdc_outb(fdc_state[fdc].dor, fdc, FD_DOR); /* * The driver will try and free resources and relies on us * to know if they were allocated or not. @@ -4910,15 +4923,16 @@ static int floppy_grab_irq_and_dma(void) cleanup: fd_free_irq(); fd_free_dma(); - while (--current_fdc >= 0) - floppy_release_regions(current_fdc); + while (--fdc >= 0) + floppy_release_regions(fdc); + current_fdc = 0; atomic_dec(&usage_count); return -1; } static void floppy_release_irq_and_dma(void) { - int old_fdc; + int fdc; #ifndef __sparc__ int drive; #endif @@ -4959,11 +4973,9 @@ static void floppy_release_irq_and_dma(void) pr_info("auxiliary floppy timer still active\n"); if (work_pending(&floppy_work)) pr_info("work still pending\n"); - old_fdc = current_fdc; - for (current_fdc = 0; current_fdc < N_FDC; current_fdc++) - if (fdc_state[current_fdc].address != -1) - floppy_release_regions(current_fdc); - current_fdc = old_fdc; + for (fdc = 0; fdc < N_FDC; fdc++) + if (fdc_state[fdc].address != -1) + floppy_release_regions(fdc); } #ifdef MODULE diff --git a/drivers/block/loop.c b/drivers/block/loop.c index da693e6a834e..475e1a738560 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -228,26 +228,36 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) blk_mq_unfreeze_queue(lo->lo_queue); } +/** + * loop_validate_block_size() - validates the passed in block size + * @bsize: size to validate + */ static int -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) +loop_validate_block_size(unsigned short bsize) { - loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); - sector_t x = (sector_t)size; - struct block_device *bdev = lo->lo_device; + if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + return -EINVAL; - if (unlikely((loff_t)x != size)) - return -EFBIG; - if (lo->lo_offset != offset) - lo->lo_offset = offset; - if (lo->lo_sizelimit != sizelimit) - lo->lo_sizelimit = sizelimit; - set_capacity(lo->lo_disk, x); - bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); - /* let user-space know about the new size */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); return 0; } +/** + * loop_set_size() - sets device size and notifies userspace + * @lo: struct loop_device to set the size for + * @size: new size of the loop device + * + * Callers must validate that the size passed into this function fits into + * a sector_t, eg using loop_validate_size() + */ +static void loop_set_size(struct loop_device *lo, loff_t size) +{ + struct block_device *bdev = lo->lo_device; + + bd_set_size(bdev, size << SECTOR_SHIFT); + + set_capacity_revalidate_and_notify(lo->lo_disk, size, false); +} + static inline int lo_do_transfer(struct loop_device *lo, int cmd, struct page *rpage, unsigned roffs, @@ -634,8 +644,8 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) static inline void loop_update_dio(struct loop_device *lo) { - __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | - lo->use_dio); + __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) | + lo->use_dio); } static void loop_reread_partitions(struct loop_device *lo, @@ -919,7 +929,7 @@ static void loop_unprepare_queue(struct loop_device *lo) static int loop_kthread_worker_fn(void *worker_ptr) { - current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO; + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; return kthread_worker_fn(worker_ptr); } @@ -952,23 +962,125 @@ static void loop_update_rotational(struct loop_device *lo) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); } -static int loop_set_fd(struct loop_device *lo, fmode_t mode, - struct block_device *bdev, unsigned int arg) +static int +loop_release_xfer(struct loop_device *lo) +{ + int err = 0; + struct loop_func_table *xfer = lo->lo_encryption; + + if (xfer) { + if (xfer->release) + err = xfer->release(lo); + lo->transfer = NULL; + lo->lo_encryption = NULL; + module_put(xfer->owner); + } + return err; +} + +static int +loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, + const struct loop_info64 *i) +{ + int err = 0; + + if (xfer) { + struct module *owner = xfer->owner; + + if (!try_module_get(owner)) + return -EINVAL; + if (xfer->init) + err = xfer->init(lo, i); + if (err) + module_put(owner); + else + lo->lo_encryption = xfer; + } + return err; +} + +/** + * loop_set_status_from_info - configure device from loop_info + * @lo: struct loop_device to configure + * @info: struct loop_info64 to configure the device with + * + * Configures the loop device parameters according to the passed + * in loop_info64 configuration. + */ +static int +loop_set_status_from_info(struct loop_device *lo, + const struct loop_info64 *info) +{ + int err; + struct loop_func_table *xfer; + kuid_t uid = current_uid(); + + if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) + return -EINVAL; + + err = loop_release_xfer(lo); + if (err) + return err; + + if (info->lo_encrypt_type) { + unsigned int type = info->lo_encrypt_type; + + if (type >= MAX_LO_CRYPT) + return -EINVAL; + xfer = xfer_funcs[type]; + if (xfer == NULL) + return -EINVAL; + } else + xfer = NULL; + + err = loop_init_xfer(lo, xfer, info); + if (err) + return err; + + lo->lo_offset = info->lo_offset; + lo->lo_sizelimit = info->lo_sizelimit; + memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); + memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); + lo->lo_file_name[LO_NAME_SIZE-1] = 0; + lo->lo_crypt_name[LO_NAME_SIZE-1] = 0; + + if (!xfer) + xfer = &none_funcs; + lo->transfer = xfer->transfer; + lo->ioctl = xfer->ioctl; + + lo->lo_flags = info->lo_flags; + + lo->lo_encrypt_key_size = info->lo_encrypt_key_size; + lo->lo_init[0] = info->lo_init[0]; + lo->lo_init[1] = info->lo_init[1]; + if (info->lo_encrypt_key_size) { + memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, + info->lo_encrypt_key_size); + lo->lo_key_owner = uid; + } + + return 0; +} + +static int loop_configure(struct loop_device *lo, fmode_t mode, + struct block_device *bdev, + const struct loop_config *config) { struct file *file; struct inode *inode; struct address_space *mapping; struct block_device *claimed_bdev = NULL; - int lo_flags = 0; int error; loff_t size; bool partscan; + unsigned short bsize; /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); error = -EBADF; - file = fget(arg); + file = fget(config->fd); if (!file) goto out; @@ -977,7 +1089,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bd_start_claiming(bdev, loop_set_fd); + claimed_bdev = bd_start_claiming(bdev, loop_configure); if (IS_ERR(claimed_bdev)) { error = PTR_ERR(claimed_bdev); goto out_putf; @@ -999,52 +1111,58 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mapping = file->f_mapping; inode = mapping->host; + size = get_loop_size(lo, file); + + if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { + error = -EINVAL; + goto out_unlock; + } + + if (config->block_size) { + error = loop_validate_block_size(config->block_size); + if (error) + goto out_unlock; + } + + error = loop_set_status_from_info(lo, &config->info); + if (error) + goto out_unlock; + if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || !file->f_op->write_iter) - lo_flags |= LO_FLAGS_READ_ONLY; + lo->lo_flags |= LO_FLAGS_READ_ONLY; - error = -EFBIG; - size = get_loop_size(lo, file); - if ((loff_t)(sector_t)size != size) - goto out_unlock; error = loop_prepare_queue(lo); if (error) goto out_unlock; - error = 0; - - set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + set_device_ro(bdev, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - lo->use_dio = false; + lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; - lo->lo_flags = lo_flags; lo->lo_backing_file = file; - lo->transfer = NULL; - lo->ioctl = NULL; - lo->lo_sizelimit = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) + if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); - if (io_is_direct(lo->lo_backing_file) && inode->i_sb->s_bdev) { + if (config->block_size) + bsize = config->block_size; + else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) /* In case of direct I/O, match underlying block size */ - unsigned short bsize = bdev_logical_block_size( - inode->i_sb->s_bdev); + bsize = bdev_logical_block_size(inode->i_sb->s_bdev); + else + bsize = 512; - blk_queue_logical_block_size(lo->lo_queue, bsize); - blk_queue_physical_block_size(lo->lo_queue, bsize); - blk_queue_io_min(lo->lo_queue, bsize); - } + blk_queue_logical_block_size(lo->lo_queue, bsize); + blk_queue_physical_block_size(lo->lo_queue, bsize); + blk_queue_io_min(lo->lo_queue, bsize); loop_update_rotational(lo); loop_update_dio(lo); - set_capacity(lo->lo_disk, size); - bd_set_size(bdev, size << 9); loop_sysfs_init(lo); - /* let user-space know about the new size */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); + loop_set_size(lo, size); set_blocksize(bdev, S_ISBLK(inode->i_mode) ? block_size(inode->i_bdev) : PAGE_SIZE); @@ -1062,14 +1180,14 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (partscan) loop_reread_partitions(lo, bdev); if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, loop_set_fd); + bd_abort_claiming(bdev, claimed_bdev, loop_configure); return 0; out_unlock: mutex_unlock(&loop_ctl_mutex); out_bdev: if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, loop_set_fd); + bd_abort_claiming(bdev, claimed_bdev, loop_configure); out_putf: fput(file); out: @@ -1078,43 +1196,6 @@ out: return error; } -static int -loop_release_xfer(struct loop_device *lo) -{ - int err = 0; - struct loop_func_table *xfer = lo->lo_encryption; - - if (xfer) { - if (xfer->release) - err = xfer->release(lo); - lo->transfer = NULL; - lo->lo_encryption = NULL; - module_put(xfer->owner); - } - return err; -} - -static int -loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, - const struct loop_info64 *i) -{ - int err = 0; - - if (xfer) { - struct module *owner = xfer->owner; - - if (!try_module_get(owner)) - return -EINVAL; - if (xfer->init) - err = xfer->init(lo, i); - if (err) - module_put(owner); - else - lo->lo_encryption = xfer; - } - return err; -} - static int __loop_clr_fd(struct loop_device *lo, bool release) { struct file *filp = NULL; @@ -1263,10 +1344,11 @@ static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; - struct loop_func_table *xfer; - kuid_t uid = current_uid(); struct block_device *bdev; + kuid_t uid = current_uid(); + int prev_lo_flags; bool partscan = false; + bool size_changed = false; err = mutex_lock_killable(&loop_ctl_mutex); if (err) @@ -1281,93 +1363,55 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) err = -ENXIO; goto out_unlock; } - if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) { - err = -EINVAL; - goto out_unlock; - } if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { + size_changed = true; sync_blockdev(lo->lo_device); - kill_bdev(lo->lo_device); + invalidate_bdev(lo->lo_device); } /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); - err = loop_release_xfer(lo); - if (err) + if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { + /* If any pages were dirtied after invalidate_bdev(), try again */ + err = -EAGAIN; + pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", + __func__, lo->lo_number, lo->lo_file_name, + lo->lo_device->bd_inode->i_mapping->nrpages); goto out_unfreeze; + } - if (info->lo_encrypt_type) { - unsigned int type = info->lo_encrypt_type; - - if (type >= MAX_LO_CRYPT) { - err = -EINVAL; - goto out_unfreeze; - } - xfer = xfer_funcs[type]; - if (xfer == NULL) { - err = -EINVAL; - goto out_unfreeze; - } - } else - xfer = NULL; + prev_lo_flags = lo->lo_flags; - err = loop_init_xfer(lo, xfer, info); + err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; - if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) { - /* kill_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { - err = -EFBIG; - goto out_unfreeze; - } + /* Mask out flags that can't be set using LOOP_SET_STATUS. */ + lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; + /* For those flags, use the previous values instead */ + lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS; + /* For flags that can't be cleared, use previous values too */ + lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + + if (size_changed) { + loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, + lo->lo_backing_file); + loop_set_size(lo, new_size); } loop_config_discard(lo); - memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); - memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); - lo->lo_file_name[LO_NAME_SIZE-1] = 0; - lo->lo_crypt_name[LO_NAME_SIZE-1] = 0; - - if (!xfer) - xfer = &none_funcs; - lo->transfer = xfer->transfer; - lo->ioctl = xfer->ioctl; - - if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) != - (info->lo_flags & LO_FLAGS_AUTOCLEAR)) - lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; - - lo->lo_encrypt_key_size = info->lo_encrypt_key_size; - lo->lo_init[0] = info->lo_init[0]; - lo->lo_init[1] = info->lo_init[1]; - if (info->lo_encrypt_key_size) { - memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, - info->lo_encrypt_key_size); - lo->lo_key_owner = uid; - } - /* update dio if lo_offset or transfer is changed */ __loop_update_dio(lo, lo->use_dio); out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); - if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && - !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_flags |= LO_FLAGS_PARTSCAN; + if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && + !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; bdev = lo->lo_device; partscan = true; @@ -1531,10 +1575,15 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { static int loop_set_capacity(struct loop_device *lo) { + loff_t size; + if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); + size = get_loop_size(lo, lo->lo_backing_file); + loop_set_size(lo, size); + + return 0; } static int loop_set_dio(struct loop_device *lo, unsigned long arg) @@ -1558,18 +1607,19 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) if (lo->lo_state != Lo_bound) return -ENXIO; - if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg)) - return -EINVAL; + err = loop_validate_block_size(arg); + if (err) + return err; if (lo->lo_queue->limits.logical_block_size == arg) return 0; sync_blockdev(lo->lo_device); - kill_bdev(lo->lo_device); + invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - /* kill_bdev should have truncated all the pages */ + /* invalidate_bdev should have truncated all the pages */ if (lo->lo_device->bd_inode->i_mapping->nrpages) { err = -EAGAIN; pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", @@ -1617,11 +1667,31 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; + void __user *argp = (void __user *) arg; int err; switch (cmd) { - case LOOP_SET_FD: - return loop_set_fd(lo, mode, bdev, arg); + case LOOP_SET_FD: { + /* + * Legacy case - pass in a zeroed out struct loop_config with + * only the file descriptor set , which corresponds with the + * default parameters we'd have used otherwise. + */ + struct loop_config config; + + memset(&config, 0, sizeof(config)); + config.fd = arg; + + return loop_configure(lo, mode, bdev, &config); + } + case LOOP_CONFIGURE: { + struct loop_config config; + + if (copy_from_user(&config, argp, sizeof(config))) + return -EFAULT; + + return loop_configure(lo, mode, bdev, &config); + } case LOOP_CHANGE_FD: return loop_change_fd(lo, bdev, arg); case LOOP_CLR_FD: @@ -1629,21 +1699,19 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_SET_STATUS: err = -EPERM; if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { - err = loop_set_status_old(lo, - (struct loop_info __user *)arg); + err = loop_set_status_old(lo, argp); } break; case LOOP_GET_STATUS: - return loop_get_status_old(lo, (struct loop_info __user *) arg); + return loop_get_status_old(lo, argp); case LOOP_SET_STATUS64: err = -EPERM; if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { - err = loop_set_status64(lo, - (struct loop_info64 __user *) arg); + err = loop_set_status64(lo, argp); } break; case LOOP_GET_STATUS64: - return loop_get_status64(lo, (struct loop_info64 __user *) arg); + return loop_get_status64(lo, argp); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: case LOOP_SET_BLOCK_SIZE: @@ -1795,6 +1863,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_CLR_FD: case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: + case LOOP_CONFIGURE: arg = (unsigned long) compat_ptr(arg); /* fall through */ case LOOP_SET_FD: @@ -2037,7 +2106,7 @@ static int loop_add(struct loop_device **l, int i) lo->tag_set.queue_depth = 128; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 62b660821dbc..81b311c9d781 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -85,26 +85,35 @@ struct nullb { char disk_name[DISK_NAME_LEN]; }; +blk_status_t null_process_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + unsigned int nr_sectors); + #ifdef CONFIG_BLK_DEV_ZONED -int null_zone_init(struct nullb_device *dev); -void null_zone_exit(struct nullb_device *dev); +int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); +int null_register_zoned_dev(struct nullb *nullb); +void null_free_zoned_dev(struct nullb_device *dev); int null_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -blk_status_t null_handle_zoned(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - sector_t nr_sectors); +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + sector_t nr_sectors); size_t null_zone_valid_read_len(struct nullb *nullb, sector_t sector, unsigned int len); #else -static inline int null_zone_init(struct nullb_device *dev) +static inline int null_init_zoned_dev(struct nullb_device *dev, + struct request_queue *q) { pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); return -EINVAL; } -static inline void null_zone_exit(struct nullb_device *dev) {} -static inline blk_status_t null_handle_zoned(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - sector_t nr_sectors) +static inline int null_register_zoned_dev(struct nullb *nullb) +{ + return -ENODEV; +} +static inline void null_free_zoned_dev(struct nullb_device *dev) {} +static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, sector_t nr_sectors) { return BLK_STS_NOTSUPP; } diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 4e1c0712278e..87b31f9ca362 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -580,7 +580,7 @@ static void null_free_dev(struct nullb_device *dev) if (!dev) return; - null_zone_exit(dev); + null_free_zoned_dev(dev); badblocks_exit(&dev->badblocks); kfree(dev); } @@ -1250,8 +1250,34 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, return errno_to_blk_status(err); } +static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct bio *bio; + + if (dev->memory_backed) + return; + + if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { + zero_fill_bio(cmd->bio); + } else if (req_op(cmd->rq) == REQ_OP_READ) { + __rq_for_each_bio(bio, cmd->rq) + zero_fill_bio(bio); + } +} + static inline void nullb_complete_cmd(struct nullb_cmd *cmd) { + /* + * Since root privileges are required to configure the null_blk + * driver, it is fine that this driver does not initialize the + * data buffers of read commands. Zero-initialize these buffers + * anyway if KMSAN is enabled to prevent that KMSAN complains + * about null_blk not initializing read data buffers. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + nullb_zero_read_cmd_buffer(cmd); + /* Complete IO by inline, softirq or timer */ switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: @@ -1276,6 +1302,25 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd) } } +blk_status_t null_process_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + unsigned int nr_sectors) +{ + struct nullb_device *dev = cmd->nq->dev; + blk_status_t ret; + + if (dev->badblocks.shift != -1) { + ret = null_handle_badblocks(cmd, sector, nr_sectors); + if (ret != BLK_STS_OK) + return ret; + } + + if (dev->memory_backed) + return null_handle_memory_backed(cmd, op); + + return BLK_STS_OK; +} + static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, sector_t nr_sectors, enum req_opf op) { @@ -1294,17 +1339,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, goto out; } - if (nullb->dev->badblocks.shift != -1) { - cmd->error = null_handle_badblocks(cmd, sector, nr_sectors); - if (cmd->error != BLK_STS_OK) - goto out; - } - - if (dev->memory_backed) - cmd->error = null_handle_memory_backed(cmd, op); - - if (!cmd->error && dev->zoned) - cmd->error = null_handle_zoned(cmd, op, sector, nr_sectors); + if (dev->zoned) + cmd->error = null_process_zoned_cmd(cmd, op, + sector, nr_sectors); + else + cmd->error = null_process_cmd(cmd, op, sector, nr_sectors); out: nullb_complete_cmd(cmd); @@ -1384,7 +1423,7 @@ static bool should_requeue_request(struct request *rq) static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) { pr_info("rq %p timed out\n", rq); - blk_mq_complete_request(rq); + blk_mq_force_complete_rq(rq); return BLK_EH_DONE; } @@ -1522,6 +1561,13 @@ static void null_config_discard(struct nullb *nullb) { if (nullb->dev->discard == false) return; + + if (nullb->dev->zoned) { + nullb->dev->discard = false; + pr_info("discard option is ignored in zoned mode\n"); + return; + } + nullb->q->limits.discard_granularity = nullb->dev->blocksize; nullb->q->limits.discard_alignment = nullb->dev->blocksize; blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); @@ -1605,19 +1651,12 @@ static int null_gendisk_register(struct nullb *nullb) disk->queue = nullb->q; strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); -#ifdef CONFIG_BLK_DEV_ZONED if (nullb->dev->zoned) { - if (queue_is_mq(nullb->q)) { - int ret = blk_revalidate_disk_zones(disk); - if (ret) - return ret; - } else { - blk_queue_chunk_sectors(nullb->q, - nullb->dev->zone_size_sects); - nullb->q->nr_zones = blkdev_nr_zones(disk); - } + int ret = null_register_zoned_dev(nullb); + + if (ret) + return ret; } -#endif add_disk(disk); return 0; @@ -1773,14 +1812,9 @@ static int null_add_dev(struct nullb_device *dev) } if (dev->zoned) { - rv = null_zone_init(dev); + rv = null_init_zoned_dev(dev, nullb->q); if (rv) goto out_cleanup_blk_queue; - - nullb->q->limits.zoned = BLK_ZONED_HM; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q); - blk_queue_required_elevator_features(nullb->q, - ELEVATOR_F_ZBD_SEQ_WRITE); } nullb->q->queuedata = nullb; @@ -1809,8 +1843,7 @@ static int null_add_dev(struct nullb_device *dev) return 0; out_cleanup_zone: - if (dev->zoned) - null_zone_exit(dev); + null_free_zoned_dev(dev); out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 673618d8222a..cc47606d8ffe 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -13,7 +13,7 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) return sect >> ilog2(dev->zone_size_sects); } -int null_zone_init(struct nullb_device *dev) +int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) { sector_t dev_size = (sector_t)dev->size * 1024 * 1024; sector_t sector = 0; @@ -23,6 +23,10 @@ int null_zone_init(struct nullb_device *dev) pr_err("zone_size must be power-of-two\n"); return -EINVAL; } + if (dev->zone_size > dev->size) { + pr_err("Zone size larger than device capacity\n"); + return -EINVAL; + } dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; dev->nr_zones = dev_size >> @@ -61,10 +65,34 @@ int null_zone_init(struct nullb_device *dev) sector += dev->zone_size_sects; } + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + + return 0; +} + +int null_register_zoned_dev(struct nullb *nullb) +{ + struct nullb_device *dev = nullb->dev; + struct request_queue *q = nullb->q; + + if (queue_is_mq(q)) { + int ret = blk_revalidate_disk_zones(nullb->disk, NULL); + + if (ret) + return ret; + } else { + blk_queue_chunk_sectors(q, dev->zone_size_sects); + q->nr_zones = blkdev_nr_zones(nullb->disk); + } + + blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); + return 0; } -void null_zone_exit(struct nullb_device *dev) +void null_free_zoned_dev(struct nullb_device *dev) { kvfree(dev->zones); } @@ -121,41 +149,57 @@ size_t null_zone_valid_read_len(struct nullb *nullb, } static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, - unsigned int nr_sectors) + unsigned int nr_sectors, bool append) { struct nullb_device *dev = cmd->nq->dev; unsigned int zno = null_zone_no(dev, sector); struct blk_zone *zone = &dev->zones[zno]; + blk_status_t ret; + + trace_nullb_zone_op(cmd, zno, zone->cond); + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); switch (zone->cond) { case BLK_ZONE_COND_FULL: /* Cannot write to a full zone */ - cmd->error = BLK_STS_IOERR; return BLK_STS_IOERR; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: - /* Writes must be at the write pointer position */ - if (sector != zone->wp) + /* + * Regular writes must be at the write pointer position. + * Zone append writes are automatically issued at the write + * pointer and the position returned using the request or BIO + * sector. + */ + if (append) { + sector = zone->wp; + if (cmd->bio) + cmd->bio->bi_iter.bi_sector = sector; + else + cmd->rq->__sector = sector; + } else if (sector != zone->wp) { return BLK_STS_IOERR; + } if (zone->cond != BLK_ZONE_COND_EXP_OPEN) zone->cond = BLK_ZONE_COND_IMP_OPEN; + ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + if (ret != BLK_STS_OK) + return ret; + zone->wp += nr_sectors; if (zone->wp == zone->start + zone->len) zone->cond = BLK_ZONE_COND_FULL; - break; - case BLK_ZONE_COND_NOT_WP: - break; + return BLK_STS_OK; default: /* Invalid zone condition */ return BLK_STS_IOERR; } - - trace_nullb_zone_op(cmd, zno, zone->cond); - return BLK_STS_OK; } static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, @@ -216,12 +260,14 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, return BLK_STS_OK; } -blk_status_t null_handle_zoned(struct nullb_cmd *cmd, enum req_opf op, - sector_t sector, sector_t nr_sectors) +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector, sector_t nr_sectors) { switch (op) { case REQ_OP_WRITE: - return null_zone_write(cmd, sector, nr_sectors); + return null_zone_write(cmd, sector, nr_sectors, false); + case REQ_OP_ZONE_APPEND: + return null_zone_write(cmd, sector, nr_sectors, true); case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: @@ -229,6 +275,6 @@ blk_status_t null_handle_zoned(struct nullb_cmd *cmd, enum req_opf op, case REQ_OP_ZONE_FINISH: return null_zone_mgmt(cmd, op, sector); default: - return BLK_STS_OK; + return null_process_cmd(cmd, op, sector, nr_sectors); } } diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig index f8bd6ef3605a..7c6ae1036927 100644 --- a/drivers/block/paride/Kconfig +++ b/drivers/block/paride/Kconfig @@ -28,7 +28,7 @@ config PARIDE_PCD depends on PARIDE select CDROM select BLK_SCSI_REQUEST # only for the generic cdrom code - ---help--- + help This option enables the high-level driver for ATAPI CD-ROM devices connected through a parallel port. If you chose to build PARIDE support into your kernel, you may answer Y here to build in the @@ -71,7 +71,7 @@ config PARIDE_PT config PARIDE_PG tristate "Parallel port generic ATAPI devices" depends on PARIDE - ---help--- + help This option enables a special high-level driver for generic ATAPI devices connected through a parallel port. The driver allows user programs, such as cdrtools, to send ATAPI commands directly to a @@ -111,7 +111,7 @@ config PARIDE_ATEN config PARIDE_BPCK tristate "MicroSolutions backpack (Series 5) protocol" depends on PARIDE - ---help--- + help This option enables support for the Micro Solutions BACKPACK parallel port Series 5 IDE protocol. (Most BACKPACK drives made before 1999 were Series 5) Series 5 drives will NOT always have the @@ -129,7 +129,7 @@ config PARIDE_BPCK config PARIDE_BPCK6 tristate "MicroSolutions backpack (Series 6) protocol" depends on PARIDE && !64BIT - ---help--- + help This option enables support for the Micro Solutions BACKPACK parallel port Series 6 IDE protocol. (Most BACKPACK drives made after 1999 were Series 6) Series 6 drives will have the Series noted diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index cda5cf917e9a..5124eca90e83 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -1032,7 +1032,7 @@ static int __init pcd_init(void) for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { if (cd->present) { - register_cdrom(&cd->info); + register_cdrom(cd->disk, &cd->info); cd->disk->private_data = cd; add_disk(cd->disk); } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 0b944ac96d6b..27a33adc41e4 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1613,7 +1613,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, disc_information di; track_information ti; __u32 last_track; - int ret = -1; + int ret; ret = pkt_get_disc_info(pd, &di); if (ret) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index c5c6487a19d5..7b55811c2a81 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -454,7 +454,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) queue->queuedata = dev; blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); - blk_queue_segment_boundary(queue, -1UL); blk_queue_dma_alignment(queue, dev->blk_size-1); blk_queue_logical_block_size(queue, dev->blk_size); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1e0a6b19ae0d..4f61e9209461 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -836,6 +836,7 @@ enum { Opt_lock_timeout, /* int args above */ Opt_pool_ns, + Opt_compression_hint, /* string args above */ Opt_read_only, Opt_read_write, @@ -844,8 +845,23 @@ enum { Opt_notrim, }; +enum { + Opt_compression_hint_none, + Opt_compression_hint_compressible, + Opt_compression_hint_incompressible, +}; + +static const struct constant_table rbd_param_compression_hint[] = { + {"none", Opt_compression_hint_none}, + {"compressible", Opt_compression_hint_compressible}, + {"incompressible", Opt_compression_hint_incompressible}, + {} +}; + static const struct fs_parameter_spec rbd_parameters[] = { fsparam_u32 ("alloc_size", Opt_alloc_size), + fsparam_enum ("compression_hint", Opt_compression_hint, + rbd_param_compression_hint), fsparam_flag ("exclusive", Opt_exclusive), fsparam_flag ("lock_on_read", Opt_lock_on_read), fsparam_u32 ("lock_timeout", Opt_lock_timeout), @@ -867,6 +883,8 @@ struct rbd_options { bool lock_on_read; bool exclusive; bool trim; + + u32 alloc_hint_flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ }; #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ @@ -1433,8 +1451,10 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) static void rbd_osd_format_read(struct ceph_osd_request *osd_req) { struct rbd_obj_request *obj_request = osd_req->r_priv; + struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; + struct ceph_options *opt = rbd_dev->rbd_client->client->options; - osd_req->r_flags = CEPH_OSD_FLAG_READ; + osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica; osd_req->r_snapid = obj_request->img_request->snap_id; } @@ -2253,7 +2273,8 @@ static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) { osd_req_op_alloc_hint_init(osd_req, which++, rbd_dev->layout.object_size, - rbd_dev->layout.object_size); + rbd_dev->layout.object_size, + rbd_dev->opts->alloc_hint_flags); } if (rbd_obj_is_entire(obj_req)) @@ -3754,11 +3775,7 @@ static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, static void rbd_notify_op_lock(struct rbd_device *rbd_dev, enum rbd_notify_op notify_op) { - struct page **reply_pages; - size_t reply_len; - - __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); - ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); + __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL); } static void rbd_notify_acquired_lock(struct work_struct *work) @@ -4527,6 +4544,10 @@ static void cancel_tasks_sync(struct rbd_device *rbd_dev) cancel_work_sync(&rbd_dev->unlock_work); } +/* + * header_rwsem must not be held to avoid a deadlock with + * rbd_dev_refresh() when flushing notifies. + */ static void rbd_unregister_watch(struct rbd_device *rbd_dev) { cancel_tasks_sync(rbd_dev); @@ -6331,6 +6352,29 @@ static int rbd_parse_param(struct fs_parameter *param, pctx->spec->pool_ns = param->string; param->string = NULL; break; + case Opt_compression_hint: + switch (result.uint_32) { + case Opt_compression_hint_none: + opt->alloc_hint_flags &= + ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE | + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE); + break; + case Opt_compression_hint_compressible: + opt->alloc_hint_flags |= + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; + opt->alloc_hint_flags &= + ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; + break; + case Opt_compression_hint_incompressible: + opt->alloc_hint_flags |= + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; + opt->alloc_hint_flags &= + ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; + break; + default: + BUG(); + } + break; case Opt_read_only: opt->read_only = true; break; @@ -6894,9 +6938,10 @@ static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap) static void rbd_dev_image_release(struct rbd_device *rbd_dev) { - rbd_dev_unprobe(rbd_dev); - if (rbd_dev->opts) + if (!rbd_is_ro(rbd_dev)) rbd_unregister_watch(rbd_dev); + + rbd_dev_unprobe(rbd_dev); rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -6907,6 +6952,9 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) * device. If this image is the one being mapped (i.e., not a * parent), initiate a watch on its header object before using that * object to get detailed information about the rbd image. + * + * On success, returns with header_rwsem held for write if called + * with @depth == 0. */ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) { @@ -6936,11 +6984,14 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) } } + if (!depth) + down_write(&rbd_dev->header_rwsem); + ret = rbd_dev_header_info(rbd_dev); if (ret) { if (ret == -ENOENT && !need_watch) rbd_print_dne(rbd_dev, false); - goto err_out_watch; + goto err_out_probe; } /* @@ -6985,10 +7036,11 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) return 0; err_out_probe: - rbd_dev_unprobe(rbd_dev); -err_out_watch: + if (!depth) + up_write(&rbd_dev->header_rwsem); if (need_watch) rbd_unregister_watch(rbd_dev); + rbd_dev_unprobe(rbd_dev); err_out_format: rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); @@ -7050,12 +7102,9 @@ static ssize_t do_rbd_add(struct bus_type *bus, goto err_out_rbd_dev; } - down_write(&rbd_dev->header_rwsem); rc = rbd_dev_image_probe(rbd_dev, 0); - if (rc < 0) { - up_write(&rbd_dev->header_rwsem); + if (rc < 0) goto err_out_rbd_dev; - } if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { rbd_warn(rbd_dev, "alloc_size adjusted to %u", diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index ac98ab6ccd3b..a600e0eb6b6f 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -93,7 +93,7 @@ struct rbd_image_header_ondisk { __le32 snap_count; __le32 reserved; __le64 snap_names_len; - struct rbd_image_snap_ondisk snaps[0]; + struct rbd_image_snap_ondisk snaps[]; } __attribute__((packed)); diff --git a/drivers/block/rnbd/Kconfig b/drivers/block/rnbd/Kconfig new file mode 100644 index 000000000000..4b6d3d816d1f --- /dev/null +++ b/drivers/block/rnbd/Kconfig @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config BLK_DEV_RNBD + bool + +config BLK_DEV_RNBD_CLIENT + tristate "RDMA Network Block Device driver client" + depends on INFINIBAND_RTRS_CLIENT + select BLK_DEV_RNBD + help + RNBD client is a network block device driver using rdma transport. + + RNBD client allows for mapping of a remote block devices over + RTRS protocol from a target system where RNBD server is running. + + If unsure, say N. + +config BLK_DEV_RNBD_SERVER + tristate "RDMA Network Block Device driver server" + depends on INFINIBAND_RTRS_SERVER + select BLK_DEV_RNBD + help + RNBD server is the server side of RNBD using rdma transport. + + RNBD server allows for exporting local block devices to a remote client + over RTRS protocol. + + If unsure, say N. diff --git a/drivers/block/rnbd/Makefile b/drivers/block/rnbd/Makefile new file mode 100644 index 000000000000..5bb1a7ad1ada --- /dev/null +++ b/drivers/block/rnbd/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs + +rnbd-client-y := rnbd-clt.o \ + rnbd-clt-sysfs.o \ + rnbd-common.o + +rnbd-server-y := rnbd-common.o \ + rnbd-srv.o \ + rnbd-srv-dev.o \ + rnbd-srv-sysfs.o + +obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o +obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o diff --git a/drivers/block/rnbd/README b/drivers/block/rnbd/README new file mode 100644 index 000000000000..1773c0aa0bd4 --- /dev/null +++ b/drivers/block/rnbd/README @@ -0,0 +1,92 @@ +******************************** +RDMA Network Block Device (RNBD) +******************************** + +Introduction +------------ + +RNBD (RDMA Network Block Device) is a pair of kernel modules +(client and server) that allow for remote access of a block device on +the server over RTRS protocol using the RDMA (InfiniBand, RoCE, iWARP) +transport. After being mapped, the remote block devices can be accessed +on the client side as local block devices. + +I/O is transferred between client and server by the RTRS transport +modules. The administration of RNBD and RTRS modules is done via +sysfs entries. + +Requirements +------------ + + RTRS kernel modules + +Quick Start +----------- + +Server side: + # modprobe rnbd_server + +Client side: + # modprobe rnbd_client + # echo "sessname=blya path=ip:10.50.100.66 device_path=/dev/ram0" > \ + /sys/devices/virtual/rnbd-client/ctl/map_device + + Where "sessname=" is a session name, a string to identify the session + on client and on server sides; "path=" is a destination IP address or + a pair of a source and a destination IPs, separated by comma. Multiple + "path=" options can be specified in order to use multipath (see RTRS + description for details); "device_path=" is the block device to be + mapped from the server side. After the session to the server machine is + established, the mapped device will appear on the client side under + /dev/rnbd<N>. + + +RNBD-Server Module Parameters +============================= + +dev_search_path +--------------- + +When a device is mapped from the client, the server generates the path +to the block device on the server side by concatenating dev_search_path +and the "device_path" that was specified in the map_device operation. + +The default dev_search_path is: "/". + +dev_search_path option can also contain %SESSNAME% in order to provide +different device namespaces for different sessions. See "device_path" +option for details. + +============================ +Protocol (rnbd/rnbd-proto.h) +============================ + +1. Before mapping first device from a given server, client sends an +RNBD_MSG_SESS_INFO to the server. Server responds with +RNBD_MSG_SESS_INFO_RSP. Currently the messages only contain the protocol +version for backward compatibility. + +2. Client requests to open a device by sending RNBD_MSG_OPEN message. This +contains the path to the device and access mode (read-only or writable). +Server responds to the message with RNBD_MSG_OPEN_RSP. This contains +a 32 bit device id to be used for IOs and device "geometry" related +information: side, max_hw_sectors, etc. + +3. Client attaches RNBD_MSG_IO to each IO message send to a device. This +message contains device id, provided by server in his rnbd_msg_open_rsp, +sector to be accessed, read-write flags and bi_size. + +4. Client closes a device by sending RNBD_MSG_CLOSE which contains only the +device id provided by the server. + +========================================= +Contributors List(in alphabetical order) +========================================= +Danil Kipnis <danil.kipnis@profitbricks.com> +Fabian Holler <mail@fholler.de> +Guoqing Jiang <guoqing.jiang@cloud.ionos.com> +Jack Wang <jinpu.wang@profitbricks.com> +Kleber Souza <kleber.souza@profitbricks.com> +Lutz Pogrell <lutz.pogrell@cloud.ionos.com> +Milind Dumbare <Milind.dumbare@gmail.com> +Roman Penyaev <roman.penyaev@profitbricks.com> diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c new file mode 100644 index 000000000000..4f4474eecadb --- /dev/null +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -0,0 +1,639 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/parser.h> +#include <linux/module.h> +#include <linux/in6.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/device.h> +#include <rdma/ib.h> +#include <rdma/rdma_cm.h> + +#include "rnbd-clt.h" + +static struct device *rnbd_dev; +static struct class *rnbd_dev_class; +static struct kobject *rnbd_devs_kobj; + +enum { + RNBD_OPT_ERR = 0, + RNBD_OPT_DEST_PORT = 1 << 0, + RNBD_OPT_PATH = 1 << 1, + RNBD_OPT_DEV_PATH = 1 << 2, + RNBD_OPT_ACCESS_MODE = 1 << 3, + RNBD_OPT_SESSNAME = 1 << 6, +}; + +static const unsigned int rnbd_opt_mandatory[] = { + RNBD_OPT_PATH, + RNBD_OPT_DEV_PATH, + RNBD_OPT_SESSNAME, +}; + +static const match_table_t rnbd_opt_tokens = { + {RNBD_OPT_PATH, "path=%s" }, + {RNBD_OPT_DEV_PATH, "device_path=%s"}, + {RNBD_OPT_DEST_PORT, "dest_port=%d" }, + {RNBD_OPT_ACCESS_MODE, "access_mode=%s"}, + {RNBD_OPT_SESSNAME, "sessname=%s" }, + {RNBD_OPT_ERR, NULL }, +}; + +struct rnbd_map_options { + char *sessname; + struct rtrs_addr *paths; + size_t *path_cnt; + char *pathname; + u16 *dest_port; + enum rnbd_access_mode *access_mode; +}; + +static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, + struct rnbd_map_options *opt) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i, dest_port; + int p_cnt = 0; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = strstrip(options); + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rnbd_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case RNBD_OPT_SESSNAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("map_device: sessname too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strlcpy(opt->sessname, p, NAME_MAX); + kfree(p); + break; + + case RNBD_OPT_PATH: + if (p_cnt >= max_path_cnt) { + pr_err("map_device: too many (> %zu) paths provided\n", + max_path_cnt); + ret = -ENOMEM; + goto out; + } + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + ret = rtrs_addr_to_sockaddr(p, strlen(p), + *opt->dest_port, + &opt->paths[p_cnt]); + if (ret) { + pr_err("Can't parse path %s: %d\n", p, ret); + kfree(p); + goto out; + } + + p_cnt++; + + kfree(p); + break; + + case RNBD_OPT_DEV_PATH: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("map_device: Device path too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strlcpy(opt->pathname, p, NAME_MAX); + kfree(p); + break; + + case RNBD_OPT_DEST_PORT: + if (match_int(args, &dest_port) || dest_port < 0 || + dest_port > 65535) { + pr_err("bad destination port number parameter '%d'\n", + dest_port); + ret = -EINVAL; + goto out; + } + *opt->dest_port = dest_port; + break; + + case RNBD_OPT_ACCESS_MODE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "ro")) { + *opt->access_mode = RNBD_ACCESS_RO; + } else if (!strcmp(p, "rw")) { + *opt->access_mode = RNBD_ACCESS_RW; + } else if (!strcmp(p, "migration")) { + *opt->access_mode = RNBD_ACCESS_MIGRATION; + } else { + pr_err("map_device: Invalid access_mode: '%s'\n", + p); + ret = -EINVAL; + kfree(p); + goto out; + } + + kfree(p); + break; + + default: + pr_err("map_device: Unknown parameter or missing value '%s'\n", + p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(rnbd_opt_mandatory); i++) { + if ((opt_mask & rnbd_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("map_device: Parameters missing\n"); + ret = -EINVAL; + break; + } + } + +out: + *opt->path_cnt = p_cnt; + kfree(options); + return ret; +} + +static ssize_t state_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + switch (dev->dev_state) { + case DEV_STATE_INIT: + return snprintf(page, PAGE_SIZE, "init\n"); + case DEV_STATE_MAPPED: + /* TODO fix cli tool before changing to proper state */ + return snprintf(page, PAGE_SIZE, "open\n"); + case DEV_STATE_MAPPED_DISCONNECTED: + /* TODO fix cli tool before changing to proper state */ + return snprintf(page, PAGE_SIZE, "closed\n"); + case DEV_STATE_UNMAPPED: + return snprintf(page, PAGE_SIZE, "unmapped\n"); + default: + return snprintf(page, PAGE_SIZE, "unknown\n"); + } +} + +static struct kobj_attribute rnbd_clt_state_attr = __ATTR_RO(state); + +static ssize_t mapping_path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", dev->pathname); +} + +static struct kobj_attribute rnbd_clt_mapping_path_attr = + __ATTR_RO(mapping_path); + +static ssize_t access_mode_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + return snprintf(page, PAGE_SIZE, "%s\n", + rnbd_access_mode_str(dev->access_mode)); +} + +static struct kobj_attribute rnbd_clt_access_mode = + __ATTR_RO(access_mode); + +static ssize_t rnbd_clt_unmap_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo <normal|force> > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_clt_unmap_dev_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_clt_dev *dev; + char *opt, *options; + bool force; + int err; + + opt = kstrdup(buf, GFP_KERNEL); + if (!opt) + return -ENOMEM; + + options = strstrip(opt); + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + if (sysfs_streq(options, "normal")) { + force = false; + } else if (sysfs_streq(options, "force")) { + force = true; + } else { + rnbd_clt_err(dev, + "unmap_device: Invalid value: %s\n", + options); + err = -EINVAL; + goto out; + } + + rnbd_clt_info(dev, "Unmapping device, option: %s.\n", + force ? "force" : "normal"); + + /* + * We take explicit module reference only for one reason: do not + * race with lockless rnbd_destroy_sessions(). + */ + if (!try_module_get(THIS_MODULE)) { + err = -ENODEV; + goto out; + } + err = rnbd_clt_unmap_device(dev, force, &attr->attr); + if (err) { + if (err != -EALREADY) + rnbd_clt_err(dev, "unmap_device: %d\n", err); + goto module_put; + } + + /* + * Here device can be vanished! + */ + + err = count; + +module_put: + module_put(THIS_MODULE); +out: + kfree(opt); + + return err; +} + +static struct kobj_attribute rnbd_clt_unmap_device_attr = + __ATTR(unmap_device, 0644, rnbd_clt_unmap_dev_show, + rnbd_clt_unmap_dev_store); + +static ssize_t rnbd_clt_resize_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo <new size in sectors> > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long sectors; + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + ret = kstrtoul(buf, 0, §ors); + if (ret) + return ret; + + ret = rnbd_clt_resize_disk(dev, (size_t)sectors); + if (ret) + return ret; + + return count; +} + +static struct kobj_attribute rnbd_clt_resize_dev_attr = + __ATTR(resize, 0644, rnbd_clt_resize_dev_show, + rnbd_clt_resize_dev_store); + +static ssize_t rnbd_clt_remap_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo <1> > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_clt_remap_dev_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_clt_dev *dev; + char *opt, *options; + int err; + + opt = kstrdup(buf, GFP_KERNEL); + if (!opt) + return -ENOMEM; + + options = strstrip(opt); + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + if (!sysfs_streq(options, "1")) { + rnbd_clt_err(dev, + "remap_device: Invalid value: %s\n", + options); + err = -EINVAL; + goto out; + } + err = rnbd_clt_remap_device(dev); + if (likely(!err)) + err = count; + +out: + kfree(opt); + + return err; +} + +static struct kobj_attribute rnbd_clt_remap_device_attr = + __ATTR(remap_device, 0644, rnbd_clt_remap_dev_show, + rnbd_clt_remap_dev_store); + +static ssize_t session_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", dev->sess->sessname); +} + +static struct kobj_attribute rnbd_clt_session_attr = + __ATTR_RO(session); + +static struct attribute *rnbd_dev_attrs[] = { + &rnbd_clt_unmap_device_attr.attr, + &rnbd_clt_resize_dev_attr.attr, + &rnbd_clt_remap_device_attr.attr, + &rnbd_clt_mapping_path_attr.attr, + &rnbd_clt_state_attr.attr, + &rnbd_clt_session_attr.attr, + &rnbd_clt_access_mode.attr, + NULL, +}; + +void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) +{ + /* + * The module unload rnbd_client_exit path is racing with unmapping of + * the last single device from the sysfs manually + * i.e. rnbd_clt_unmap_dev_store() leading to a sysfs warning because + * of sysfs link already was removed already. + */ + if (strlen(dev->blk_symlink_name) && try_module_get(THIS_MODULE)) { + sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); + module_put(THIS_MODULE); + } +} + +static struct kobj_type rnbd_dev_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = rnbd_dev_attrs, +}; + +static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) +{ + int ret; + struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; + + ret = kobject_init_and_add(&dev->kobj, &rnbd_dev_ktype, gd_kobj, "%s", + "rnbd"); + if (ret) + rnbd_clt_err(dev, "Failed to create device sysfs dir, err: %d\n", + ret); + + return ret; +} + +static ssize_t rnbd_clt_map_device_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"[dest_port=server port number] sessname=<name of the rtrs session> path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path=<full path on remote side> [access_mode=<ro|rw|migration>]\" > %s\n\naddr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]\n", + attr->attr.name); +} + +static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, + size_t len) +{ + int ret; + char pathname[NAME_MAX], *s; + + strlcpy(pathname, dev->pathname, sizeof(pathname)); + while ((s = strchr(pathname, '/'))) + s[0] = '!'; + + ret = snprintf(buf, len, "%s", pathname); + if (ret >= len) + return -ENAMETOOLONG; + + return 0; +} + +static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) +{ + struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; + int ret; + + ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name, + sizeof(dev->blk_symlink_name)); + if (ret) { + rnbd_clt_err(dev, "Failed to get /sys/block symlink path, err: %d\n", + ret); + goto out_err; + } + + ret = sysfs_create_link(rnbd_devs_kobj, gd_kobj, + dev->blk_symlink_name); + if (ret) { + rnbd_clt_err(dev, "Creating /sys/block symlink failed, err: %d\n", + ret); + goto out_err; + } + + return 0; + +out_err: + dev->blk_symlink_name[0] = '\0'; + return ret; +} + +static ssize_t rnbd_clt_map_device_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_clt_dev *dev; + struct rnbd_map_options opt; + int ret; + char pathname[NAME_MAX]; + char sessname[NAME_MAX]; + enum rnbd_access_mode access_mode = RNBD_ACCESS_RW; + u16 port_nr = RTRS_PORT; + + struct sockaddr_storage *addrs; + struct rtrs_addr paths[6]; + size_t path_cnt; + + opt.sessname = sessname; + opt.paths = paths; + opt.path_cnt = &path_cnt; + opt.pathname = pathname; + opt.dest_port = &port_nr; + opt.access_mode = &access_mode; + addrs = kcalloc(ARRAY_SIZE(paths) * 2, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) { + paths[path_cnt].src = &addrs[path_cnt * 2]; + paths[path_cnt].dst = &addrs[path_cnt * 2 + 1]; + } + + ret = rnbd_clt_parse_map_options(buf, ARRAY_SIZE(paths), &opt); + if (ret) + goto out; + + pr_info("Mapping device %s on session %s, (access_mode: %s)\n", + pathname, sessname, + rnbd_access_mode_str(access_mode)); + + dev = rnbd_clt_map_device(sessname, paths, path_cnt, port_nr, pathname, + access_mode); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + + ret = rnbd_clt_add_dev_kobj(dev); + if (ret) + goto unmap_dev; + + ret = rnbd_clt_add_dev_symlink(dev); + if (ret) + goto unmap_dev; + + kfree(addrs); + return count; + +unmap_dev: + rnbd_clt_unmap_device(dev, true, NULL); +out: + kfree(addrs); + return ret; +} + +static struct kobj_attribute rnbd_clt_map_device_attr = + __ATTR(map_device, 0644, + rnbd_clt_map_device_show, rnbd_clt_map_device_store); + +static struct attribute *default_attrs[] = { + &rnbd_clt_map_device_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +static const struct attribute_group *default_attr_groups[] = { + &default_attr_group, + NULL, +}; + +int rnbd_clt_create_sysfs_files(void) +{ + int err; + + rnbd_dev_class = class_create(THIS_MODULE, "rnbd-client"); + if (IS_ERR(rnbd_dev_class)) + return PTR_ERR(rnbd_dev_class); + + rnbd_dev = device_create_with_groups(rnbd_dev_class, NULL, + MKDEV(0, 0), NULL, + default_attr_groups, "ctl"); + if (IS_ERR(rnbd_dev)) { + err = PTR_ERR(rnbd_dev); + goto cls_destroy; + } + rnbd_devs_kobj = kobject_create_and_add("devices", &rnbd_dev->kobj); + if (!rnbd_devs_kobj) { + err = -ENOMEM; + goto dev_destroy; + } + + return 0; + +dev_destroy: + device_destroy(rnbd_dev_class, MKDEV(0, 0)); +cls_destroy: + class_destroy(rnbd_dev_class); + + return err; +} + +void rnbd_clt_destroy_default_group(void) +{ + sysfs_remove_group(&rnbd_dev->kobj, &default_attr_group); +} + +void rnbd_clt_destroy_sysfs_files(void) +{ + kobject_del(rnbd_devs_kobj); + kobject_put(rnbd_devs_kobj); + device_destroy(rnbd_dev_class, MKDEV(0, 0)); + class_destroy(rnbd_dev_class); +} diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c new file mode 100644 index 000000000000..cc6a4e2587ae --- /dev/null +++ b/drivers/block/rnbd/rnbd-clt.c @@ -0,0 +1,1729 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/hdreg.h> +#include <linux/scatterlist.h> +#include <linux/idr.h> + +#include "rnbd-clt.h" + +MODULE_DESCRIPTION("RDMA Network Block Device Client"); +MODULE_LICENSE("GPL"); + +static int rnbd_client_major; +static DEFINE_IDA(index_ida); +static DEFINE_MUTEX(ida_lock); +static DEFINE_MUTEX(sess_lock); +static LIST_HEAD(sess_list); + +/* + * Maximum number of partitions an instance can have. + * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself) + */ +#define RNBD_PART_BITS 6 + +static inline bool rnbd_clt_get_sess(struct rnbd_clt_session *sess) +{ + return refcount_inc_not_zero(&sess->refcount); +} + +static void free_sess(struct rnbd_clt_session *sess); + +static void rnbd_clt_put_sess(struct rnbd_clt_session *sess) +{ + might_sleep(); + + if (refcount_dec_and_test(&sess->refcount)) + free_sess(sess); +} + +static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) +{ + might_sleep(); + + if (!refcount_dec_and_test(&dev->refcount)) + return; + + mutex_lock(&ida_lock); + ida_simple_remove(&index_ida, dev->clt_device_id); + mutex_unlock(&ida_lock); + kfree(dev->hw_queues); + rnbd_clt_put_sess(dev->sess); + mutex_destroy(&dev->lock); + kfree(dev); +} + +static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) +{ + return refcount_inc_not_zero(&dev->refcount); +} + +static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, + const struct rnbd_msg_open_rsp *rsp) +{ + struct rnbd_clt_session *sess = dev->sess; + + if (!rsp->logical_block_size) + return -EINVAL; + + dev->device_id = le32_to_cpu(rsp->device_id); + dev->nsectors = le64_to_cpu(rsp->nsectors); + dev->logical_block_size = le16_to_cpu(rsp->logical_block_size); + dev->physical_block_size = le16_to_cpu(rsp->physical_block_size); + dev->max_write_same_sectors = le32_to_cpu(rsp->max_write_same_sectors); + dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors); + dev->discard_granularity = le32_to_cpu(rsp->discard_granularity); + dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); + dev->secure_discard = le16_to_cpu(rsp->secure_discard); + dev->rotational = rsp->rotational; + + dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE; + dev->max_segments = BMAX_SEGMENTS; + + dev->max_hw_sectors = min_t(u32, dev->max_hw_sectors, + le32_to_cpu(rsp->max_hw_sectors)); + dev->max_segments = min_t(u16, dev->max_segments, + le16_to_cpu(rsp->max_segments)); + + return 0; +} + +static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, + size_t new_nsectors) +{ + int err = 0; + + rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n", + dev->nsectors, new_nsectors); + dev->nsectors = new_nsectors; + set_capacity(dev->gd, dev->nsectors); + err = revalidate_disk(dev->gd); + if (err) + rnbd_clt_err(dev, + "Failed to change device size from %zu to %zu, err: %d\n", + dev->nsectors, new_nsectors, err); + return err; +} + +static int process_msg_open_rsp(struct rnbd_clt_dev *dev, + struct rnbd_msg_open_rsp *rsp) +{ + int err = 0; + + mutex_lock(&dev->lock); + if (dev->dev_state == DEV_STATE_UNMAPPED) { + rnbd_clt_info(dev, + "Ignoring Open-Response message from server for unmapped device\n"); + err = -ENOENT; + goto out; + } + if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) { + u64 nsectors = le64_to_cpu(rsp->nsectors); + + /* + * If the device was remapped and the size changed in the + * meantime we need to revalidate it + */ + if (dev->nsectors != nsectors) + rnbd_clt_change_capacity(dev, nsectors); + rnbd_clt_info(dev, "Device online, device remapped successfully\n"); + } + err = rnbd_clt_set_dev_attr(dev, rsp); + if (err) + goto out; + dev->dev_state = DEV_STATE_MAPPED; + +out: + mutex_unlock(&dev->lock); + + return err; +} + +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize) +{ + int ret = 0; + + mutex_lock(&dev->lock); + if (dev->dev_state != DEV_STATE_MAPPED) { + pr_err("Failed to set new size of the device, device is not opened\n"); + ret = -ENOENT; + goto out; + } + ret = rnbd_clt_change_capacity(dev, newsize); + +out: + mutex_unlock(&dev->lock); + + return ret; +} + +static inline void rnbd_clt_dev_requeue(struct rnbd_queue *q) +{ + if (WARN_ON(!q->hctx)) + return; + + /* We can come here from interrupt, thus async=true */ + blk_mq_run_hw_queue(q->hctx, true); +} + +enum { + RNBD_DELAY_IFBUSY = -1, +}; + +/** + * rnbd_get_cpu_qlist() - finds a list with HW queues to be rerun + * @sess: Session to find a queue for + * @cpu: Cpu to start the search from + * + * Description: + * Each CPU has a list of HW queues, which needs to be rerun. If a list + * is not empty - it is marked with a bit. This function finds first + * set bit in a bitmap and returns corresponding CPU list. + */ +static struct rnbd_cpu_qlist * +rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu) +{ + int bit; + + /* Search from cpu to nr_cpu_ids */ + bit = find_next_bit(sess->cpu_queues_bm, nr_cpu_ids, cpu); + if (bit < nr_cpu_ids) { + return per_cpu_ptr(sess->cpu_queues, bit); + } else if (cpu != 0) { + /* Search from 0 to cpu */ + bit = find_next_bit(sess->cpu_queues_bm, cpu, 0); + if (bit < cpu) + return per_cpu_ptr(sess->cpu_queues, bit); + } + + return NULL; +} + +static inline int nxt_cpu(int cpu) +{ + return (cpu + 1) % nr_cpu_ids; +} + +/** + * rnbd_rerun_if_needed() - rerun next queue marked as stopped + * @sess: Session to rerun a queue on + * + * Description: + * Each CPU has it's own list of HW queues, which should be rerun. + * Function finds such list with HW queues, takes a list lock, picks up + * the first HW queue out of the list and requeues it. + * + * Return: + * True if the queue was requeued, false otherwise. + * + * Context: + * Does not matter. + */ +static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess) +{ + struct rnbd_queue *q = NULL; + struct rnbd_cpu_qlist *cpu_q; + unsigned long flags; + int *cpup; + + /* + * To keep fairness and not to let other queues starve we always + * try to wake up someone else in round-robin manner. That of course + * increases latency but queues always have a chance to be executed. + */ + cpup = get_cpu_ptr(sess->cpu_rr); + for (cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(*cpup)); cpu_q; + cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) { + if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags)) + continue; + if (unlikely(!test_bit(cpu_q->cpu, sess->cpu_queues_bm))) + goto unlock; + q = list_first_entry_or_null(&cpu_q->requeue_list, + typeof(*q), requeue_list); + if (WARN_ON(!q)) + goto clear_bit; + list_del_init(&q->requeue_list); + clear_bit_unlock(0, &q->in_list); + + if (list_empty(&cpu_q->requeue_list)) { + /* Clear bit if nothing is left */ +clear_bit: + clear_bit(cpu_q->cpu, sess->cpu_queues_bm); + } +unlock: + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + + if (q) + break; + } + + /** + * Saves the CPU that is going to be requeued on the per-cpu var. Just + * incrementing it doesn't work because rnbd_get_cpu_qlist() will + * always return the first CPU with something on the queue list when the + * value stored on the var is greater than the last CPU with something + * on the list. + */ + if (cpu_q) + *cpup = cpu_q->cpu; + put_cpu_var(sess->cpu_rr); + + if (q) + rnbd_clt_dev_requeue(q); + + return q; +} + +/** + * rnbd_rerun_all_if_idle() - rerun all queues left in the list if + * session is idling (there are no requests + * in-flight). + * @sess: Session to rerun the queues on + * + * Description: + * This function tries to rerun all stopped queues if there are no + * requests in-flight anymore. This function tries to solve an obvious + * problem, when number of tags < than number of queues (hctx), which + * are stopped and put to sleep. If last permit, which has been just put, + * does not wake up all left queues (hctxs), IO requests hang forever. + * + * That can happen when all number of permits, say N, have been exhausted + * from one CPU, and we have many block devices per session, say M. + * Each block device has it's own queue (hctx) for each CPU, so eventually + * we can put that number of queues (hctxs) to sleep: M x nr_cpu_ids. + * If number of permits N < M x nr_cpu_ids finally we will get an IO hang. + * + * To avoid this hang last caller of rnbd_put_permit() (last caller is the + * one who observes sess->busy == 0) must wake up all remaining queues. + * + * Context: + * Does not matter. + */ +static void rnbd_rerun_all_if_idle(struct rnbd_clt_session *sess) +{ + bool requeued; + + do { + requeued = rnbd_rerun_if_needed(sess); + } while (atomic_read(&sess->busy) == 0 && requeued); +} + +static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess, + enum rtrs_clt_con_type con_type, + int wait) +{ + struct rtrs_permit *permit; + + permit = rtrs_clt_get_permit(sess->rtrs, con_type, + wait ? RTRS_PERMIT_WAIT : + RTRS_PERMIT_NOWAIT); + if (likely(permit)) + /* We have a subtle rare case here, when all permits can be + * consumed before busy counter increased. This is safe, + * because loser will get NULL as a permit, observe 0 busy + * counter and immediately restart the queue himself. + */ + atomic_inc(&sess->busy); + + return permit; +} + +static void rnbd_put_permit(struct rnbd_clt_session *sess, + struct rtrs_permit *permit) +{ + rtrs_clt_put_permit(sess->rtrs, permit); + atomic_dec(&sess->busy); + /* Paired with rnbd_clt_dev_add_to_requeue(). Decrement first + * and then check queue bits. + */ + smp_mb__after_atomic(); + rnbd_rerun_all_if_idle(sess); +} + +static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, + enum rtrs_clt_con_type con_type, + int wait) +{ + struct rnbd_iu *iu; + struct rtrs_permit *permit; + + permit = rnbd_get_permit(sess, con_type, + wait ? RTRS_PERMIT_WAIT : + RTRS_PERMIT_NOWAIT); + if (unlikely(!permit)) + return NULL; + iu = rtrs_permit_to_pdu(permit); + iu->permit = permit; + /* + * 1st reference is dropped after finishing sending a "user" message, + * 2nd reference is dropped after confirmation with the response is + * returned. + * 1st and 2nd can happen in any order, so the rnbd_iu should be + * released (rtrs_permit returned to ibbtrs) only leased after both + * are finished. + */ + atomic_set(&iu->refcount, 2); + init_waitqueue_head(&iu->comp.wait); + iu->comp.errno = INT_MAX; + + return iu; +} + +static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu) +{ + if (atomic_dec_and_test(&iu->refcount)) + rnbd_put_permit(sess, iu->permit); +} + +static void rnbd_softirq_done_fn(struct request *rq) +{ + struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_session *sess = dev->sess; + struct rnbd_iu *iu; + + iu = blk_mq_rq_to_pdu(rq); + rnbd_put_permit(sess, iu->permit); + blk_mq_end_request(rq, errno_to_blk_status(iu->errno)); +} + +static void msg_io_conf(void *priv, int errno) +{ + struct rnbd_iu *iu = priv; + struct rnbd_clt_dev *dev = iu->dev; + struct request *rq = iu->rq; + int rw = rq_data_dir(rq); + + iu->errno = errno; + + blk_mq_complete_request(rq); + + if (errno) + rnbd_clt_info_rl(dev, "%s I/O failed with err: %d\n", + rw == READ ? "read" : "write", errno); +} + +static void wake_up_iu_comp(struct rnbd_iu *iu, int errno) +{ + iu->comp.errno = errno; + wake_up(&iu->comp.wait); +} + +static void msg_conf(void *priv, int errno) +{ + struct rnbd_iu *iu = priv; + + iu->errno = errno; + schedule_work(&iu->work); +} + +enum wait_type { + NO_WAIT = 0, + WAIT = 1 +}; + +static int send_usr_msg(struct rtrs_clt *rtrs, int dir, + struct rnbd_iu *iu, struct kvec *vec, size_t nr, + size_t len, struct scatterlist *sg, unsigned int sg_len, + void (*conf)(struct work_struct *work), + int *errno, enum wait_type wait) +{ + int err; + struct rtrs_clt_req_ops req_ops; + + INIT_WORK(&iu->work, conf); + req_ops = (struct rtrs_clt_req_ops) { + .priv = iu, + .conf_fn = msg_conf, + }; + err = rtrs_clt_request(dir, &req_ops, rtrs, iu->permit, + vec, nr, len, sg, sg_len); + if (!err && wait) { + wait_event(iu->comp.wait, iu->comp.errno != INT_MAX); + *errno = iu->comp.errno; + } else { + *errno = 0; + } + + return err; +} + +static void msg_close_conf(struct work_struct *work) +{ + struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work); + struct rnbd_clt_dev *dev = iu->dev; + + wake_up_iu_comp(iu, iu->errno); + rnbd_put_iu(dev->sess, iu); + rnbd_clt_put_dev(dev); +} + +static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait) +{ + struct rnbd_clt_session *sess = dev->sess; + struct rnbd_msg_close msg; + struct rnbd_iu *iu; + struct kvec vec = { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; + int err, errno; + + iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT); + if (!iu) + return -ENOMEM; + + iu->buf = NULL; + iu->dev = dev; + + sg_mark_end(&iu->sglist[0]); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_CLOSE); + msg.device_id = cpu_to_le32(device_id); + + WARN_ON(!rnbd_clt_get_dev(dev)); + err = send_usr_msg(sess->rtrs, WRITE, iu, &vec, 1, 0, NULL, 0, + msg_close_conf, &errno, wait); + if (err) { + rnbd_clt_put_dev(dev); + rnbd_put_iu(sess, iu); + } else { + err = errno; + } + + rnbd_put_iu(sess, iu); + return err; +} + +static void msg_open_conf(struct work_struct *work) +{ + struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work); + struct rnbd_msg_open_rsp *rsp = iu->buf; + struct rnbd_clt_dev *dev = iu->dev; + int errno = iu->errno; + + if (errno) { + rnbd_clt_err(dev, + "Opening failed, server responded: %d\n", + errno); + } else { + errno = process_msg_open_rsp(dev, rsp); + if (errno) { + u32 device_id = le32_to_cpu(rsp->device_id); + /* + * If server thinks its fine, but we fail to process + * then be nice and send a close to server. + */ + (void)send_msg_close(dev, device_id, NO_WAIT); + } + } + kfree(rsp); + wake_up_iu_comp(iu, errno); + rnbd_put_iu(dev->sess, iu); + rnbd_clt_put_dev(dev); +} + +static void msg_sess_info_conf(struct work_struct *work) +{ + struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work); + struct rnbd_msg_sess_info_rsp *rsp = iu->buf; + struct rnbd_clt_session *sess = iu->sess; + + if (!iu->errno) + sess->ver = min_t(u8, rsp->ver, RNBD_PROTO_VER_MAJOR); + + kfree(rsp); + wake_up_iu_comp(iu, iu->errno); + rnbd_put_iu(sess, iu); + rnbd_clt_put_sess(sess); +} + +static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) +{ + struct rnbd_clt_session *sess = dev->sess; + struct rnbd_msg_open_rsp *rsp; + struct rnbd_msg_open msg; + struct rnbd_iu *iu; + struct kvec vec = { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; + int err, errno; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT); + if (!iu) { + kfree(rsp); + return -ENOMEM; + } + + iu->buf = rsp; + iu->dev = dev; + + sg_init_one(iu->sglist, rsp, sizeof(*rsp)); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN); + msg.access_mode = dev->access_mode; + strlcpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name)); + + WARN_ON(!rnbd_clt_get_dev(dev)); + err = send_usr_msg(sess->rtrs, READ, iu, + &vec, 1, sizeof(*rsp), iu->sglist, 1, + msg_open_conf, &errno, wait); + if (err) { + rnbd_clt_put_dev(dev); + rnbd_put_iu(sess, iu); + kfree(rsp); + } else { + err = errno; + } + + rnbd_put_iu(sess, iu); + return err; +} + +static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait) +{ + struct rnbd_msg_sess_info_rsp *rsp; + struct rnbd_msg_sess_info msg; + struct rnbd_iu *iu; + struct kvec vec = { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; + int err, errno; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT); + if (!iu) { + kfree(rsp); + return -ENOMEM; + } + + iu->buf = rsp; + iu->sess = sess; + + sg_init_one(iu->sglist, rsp, sizeof(*rsp)); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO); + msg.ver = RNBD_PROTO_VER_MAJOR; + + if (!rnbd_clt_get_sess(sess)) { + /* + * That can happen only in one case, when RTRS has restablished + * the connection and link_ev() is called, but session is almost + * dead, last reference on session is put and caller is waiting + * for RTRS to close everything. + */ + err = -ENODEV; + goto put_iu; + } + err = send_usr_msg(sess->rtrs, READ, iu, + &vec, 1, sizeof(*rsp), iu->sglist, 1, + msg_sess_info_conf, &errno, wait); + if (err) { + rnbd_clt_put_sess(sess); +put_iu: + rnbd_put_iu(sess, iu); + kfree(rsp); + } else { + err = errno; + } + + rnbd_put_iu(sess, iu); + return err; +} + +static void set_dev_states_to_disconnected(struct rnbd_clt_session *sess) +{ + struct rnbd_clt_dev *dev; + + mutex_lock(&sess->lock); + list_for_each_entry(dev, &sess->devs_list, list) { + rnbd_clt_err(dev, "Device disconnected.\n"); + + mutex_lock(&dev->lock); + if (dev->dev_state == DEV_STATE_MAPPED) + dev->dev_state = DEV_STATE_MAPPED_DISCONNECTED; + mutex_unlock(&dev->lock); + } + mutex_unlock(&sess->lock); +} + +static void remap_devs(struct rnbd_clt_session *sess) +{ + struct rnbd_clt_dev *dev; + struct rtrs_attrs attrs; + int err; + + /* + * Careful here: we are called from RTRS link event directly, + * thus we can't send any RTRS request and wait for response + * or RTRS will not be able to complete request with failure + * if something goes wrong (failing of outstanding requests + * happens exactly from the context where we are blocking now). + * + * So to avoid deadlocks each usr message sent from here must + * be asynchronous. + */ + + err = send_msg_sess_info(sess, NO_WAIT); + if (err) { + pr_err("send_msg_sess_info(\"%s\"): %d\n", sess->sessname, err); + return; + } + + rtrs_clt_query(sess->rtrs, &attrs); + mutex_lock(&sess->lock); + sess->max_io_size = attrs.max_io_size; + + list_for_each_entry(dev, &sess->devs_list, list) { + bool skip; + + mutex_lock(&dev->lock); + skip = (dev->dev_state == DEV_STATE_INIT); + mutex_unlock(&dev->lock); + if (skip) + /* + * When device is establishing connection for the first + * time - do not remap, it will be closed soon. + */ + continue; + + rnbd_clt_info(dev, "session reconnected, remapping device\n"); + err = send_msg_open(dev, NO_WAIT); + if (err) { + rnbd_clt_err(dev, "send_msg_open(): %d\n", err); + break; + } + } + mutex_unlock(&sess->lock); +} + +static void rnbd_clt_link_ev(void *priv, enum rtrs_clt_link_ev ev) +{ + struct rnbd_clt_session *sess = priv; + + switch (ev) { + case RTRS_CLT_LINK_EV_DISCONNECTED: + set_dev_states_to_disconnected(sess); + break; + case RTRS_CLT_LINK_EV_RECONNECTED: + remap_devs(sess); + break; + default: + pr_err("Unknown session event received (%d), session: %s\n", + ev, sess->sessname); + } +} + +static void rnbd_init_cpu_qlists(struct rnbd_cpu_qlist __percpu *cpu_queues) +{ + unsigned int cpu; + struct rnbd_cpu_qlist *cpu_q; + + for_each_possible_cpu(cpu) { + cpu_q = per_cpu_ptr(cpu_queues, cpu); + + cpu_q->cpu = cpu; + INIT_LIST_HEAD(&cpu_q->requeue_list); + spin_lock_init(&cpu_q->requeue_lock); + } +} + +static void destroy_mq_tags(struct rnbd_clt_session *sess) +{ + if (sess->tag_set.tags) + blk_mq_free_tag_set(&sess->tag_set); +} + +static inline void wake_up_rtrs_waiters(struct rnbd_clt_session *sess) +{ + sess->rtrs_ready = true; + wake_up_all(&sess->rtrs_waitq); +} + +static void close_rtrs(struct rnbd_clt_session *sess) +{ + might_sleep(); + + if (!IS_ERR_OR_NULL(sess->rtrs)) { + rtrs_clt_close(sess->rtrs); + sess->rtrs = NULL; + wake_up_rtrs_waiters(sess); + } +} + +static void free_sess(struct rnbd_clt_session *sess) +{ + WARN_ON(!list_empty(&sess->devs_list)); + + might_sleep(); + + close_rtrs(sess); + destroy_mq_tags(sess); + if (!list_empty(&sess->list)) { + mutex_lock(&sess_lock); + list_del(&sess->list); + mutex_unlock(&sess_lock); + } + free_percpu(sess->cpu_queues); + free_percpu(sess->cpu_rr); + mutex_destroy(&sess->lock); + kfree(sess); +} + +static struct rnbd_clt_session *alloc_sess(const char *sessname) +{ + struct rnbd_clt_session *sess; + int err, cpu; + + sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE); + if (!sess) + return ERR_PTR(-ENOMEM); + strlcpy(sess->sessname, sessname, sizeof(sess->sessname)); + atomic_set(&sess->busy, 0); + mutex_init(&sess->lock); + INIT_LIST_HEAD(&sess->devs_list); + INIT_LIST_HEAD(&sess->list); + bitmap_zero(sess->cpu_queues_bm, NR_CPUS); + init_waitqueue_head(&sess->rtrs_waitq); + refcount_set(&sess->refcount, 1); + + sess->cpu_queues = alloc_percpu(struct rnbd_cpu_qlist); + if (!sess->cpu_queues) { + err = -ENOMEM; + goto err; + } + rnbd_init_cpu_qlists(sess->cpu_queues); + + /* + * That is simple percpu variable which stores cpu indeces, which are + * incremented on each access. We need that for the sake of fairness + * to wake up queues in a round-robin manner. + */ + sess->cpu_rr = alloc_percpu(int); + if (!sess->cpu_rr) { + err = -ENOMEM; + goto err; + } + for_each_possible_cpu(cpu) + * per_cpu_ptr(sess->cpu_rr, cpu) = cpu; + + return sess; + +err: + free_sess(sess); + + return ERR_PTR(err); +} + +static int wait_for_rtrs_connection(struct rnbd_clt_session *sess) +{ + wait_event(sess->rtrs_waitq, sess->rtrs_ready); + if (IS_ERR_OR_NULL(sess->rtrs)) + return -ECONNRESET; + + return 0; +} + +static void wait_for_rtrs_disconnection(struct rnbd_clt_session *sess) + __releases(&sess_lock) + __acquires(&sess_lock) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&sess->rtrs_waitq, &wait, TASK_UNINTERRUPTIBLE); + if (IS_ERR_OR_NULL(sess->rtrs)) { + finish_wait(&sess->rtrs_waitq, &wait); + return; + } + mutex_unlock(&sess_lock); + /* loop in caller, see __find_and_get_sess(). + * You can't leave mutex locked and call schedule(), you will catch a + * deadlock with a caller of free_sess(), which has just put the last + * reference and is about to take the sess_lock in order to delete + * the session from the list. + */ + schedule(); + mutex_lock(&sess_lock); +} + +static struct rnbd_clt_session *__find_and_get_sess(const char *sessname) + __releases(&sess_lock) + __acquires(&sess_lock) +{ + struct rnbd_clt_session *sess, *sn; + int err; + +again: + list_for_each_entry_safe(sess, sn, &sess_list, list) { + if (strcmp(sessname, sess->sessname)) + continue; + + if (sess->rtrs_ready && IS_ERR_OR_NULL(sess->rtrs)) + /* + * No RTRS connection, session is dying. + */ + continue; + + if (rnbd_clt_get_sess(sess)) { + /* + * Alive session is found, wait for RTRS connection. + */ + mutex_unlock(&sess_lock); + err = wait_for_rtrs_connection(sess); + if (err) + rnbd_clt_put_sess(sess); + mutex_lock(&sess_lock); + + if (err) + /* Session is dying, repeat the loop */ + goto again; + + return sess; + } + /* + * Ref is 0, session is dying, wait for RTRS disconnect + * in order to avoid session names clashes. + */ + wait_for_rtrs_disconnection(sess); + /* + * RTRS is disconnected and soon session will be freed, + * so repeat a loop. + */ + goto again; + } + + return NULL; +} + +static struct +rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first) +{ + struct rnbd_clt_session *sess = NULL; + + mutex_lock(&sess_lock); + sess = __find_and_get_sess(sessname); + if (!sess) { + sess = alloc_sess(sessname); + if (IS_ERR(sess)) { + mutex_unlock(&sess_lock); + return sess; + } + list_add(&sess->list, &sess_list); + *first = true; + } else + *first = false; + mutex_unlock(&sess_lock); + + return sess; +} + +static int rnbd_client_open(struct block_device *block_device, fmode_t mode) +{ + struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + + if (dev->read_only && (mode & FMODE_WRITE)) + return -EPERM; + + if (dev->dev_state == DEV_STATE_UNMAPPED || + !rnbd_clt_get_dev(dev)) + return -EIO; + + return 0; +} + +static void rnbd_client_release(struct gendisk *gen, fmode_t mode) +{ + struct rnbd_clt_dev *dev = gen->private_data; + + rnbd_clt_put_dev(dev); +} + +static int rnbd_client_getgeo(struct block_device *block_device, + struct hd_geometry *geo) +{ + u64 size; + struct rnbd_clt_dev *dev; + + dev = block_device->bd_disk->private_data; + size = dev->size * (dev->logical_block_size / SECTOR_SIZE); + geo->cylinders = size >> 6; /* size/64 */ + geo->heads = 4; + geo->sectors = 16; + geo->start = 0; + + return 0; +} + +static const struct block_device_operations rnbd_client_ops = { + .owner = THIS_MODULE, + .open = rnbd_client_open, + .release = rnbd_client_release, + .getgeo = rnbd_client_getgeo +}; + +/* The amount of data that belongs to an I/O and the amount of data that + * should be read or written to the disk (bi_size) can differ. + * + * E.g. When WRITE_SAME is used, only a small amount of data is + * transferred that is then written repeatedly over a lot of sectors. + * + * Get the size of data to be transferred via RTRS by summing up the size + * of the scather-gather list entries. + */ +static size_t rnbd_clt_get_sg_size(struct scatterlist *sglist, u32 len) +{ + struct scatterlist *sg; + size_t tsize = 0; + int i; + + for_each_sg(sglist, sg, len, i) + tsize += sg->length; + return tsize; +} + +static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, + struct request *rq, + struct rnbd_iu *iu) +{ + struct rtrs_clt *rtrs = dev->sess->rtrs; + struct rtrs_permit *permit = iu->permit; + struct rnbd_msg_io msg; + struct rtrs_clt_req_ops req_ops; + unsigned int sg_cnt = 0; + struct kvec vec; + size_t size; + int err; + + iu->rq = rq; + iu->dev = dev; + msg.sector = cpu_to_le64(blk_rq_pos(rq)); + msg.bi_size = cpu_to_le32(blk_rq_bytes(rq)); + msg.rw = cpu_to_le32(rq_to_rnbd_flags(rq)); + msg.prio = cpu_to_le16(req_get_ioprio(rq)); + + /* + * We only support discards with single segment for now. + * See queue limits. + */ + if (req_op(rq) != REQ_OP_DISCARD) + sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sglist); + + if (sg_cnt == 0) + /* Do not forget to mark the end */ + sg_mark_end(&iu->sglist[0]); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_IO); + msg.device_id = cpu_to_le32(dev->device_id); + + vec = (struct kvec) { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; + size = rnbd_clt_get_sg_size(iu->sglist, sg_cnt); + req_ops = (struct rtrs_clt_req_ops) { + .priv = iu, + .conf_fn = msg_io_conf, + }; + err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit, + &vec, 1, size, iu->sglist, sg_cnt); + if (unlikely(err)) { + rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n", + err); + return err; + } + + return 0; +} + +/** + * rnbd_clt_dev_add_to_requeue() - add device to requeue if session is busy + * @dev: Device to be checked + * @q: Queue to be added to the requeue list if required + * + * Description: + * If session is busy, that means someone will requeue us when resources + * are freed. If session is not doing anything - device is not added to + * the list and @false is returned. + */ +static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev, + struct rnbd_queue *q) +{ + struct rnbd_clt_session *sess = dev->sess; + struct rnbd_cpu_qlist *cpu_q; + unsigned long flags; + bool added = true; + bool need_set; + + cpu_q = get_cpu_ptr(sess->cpu_queues); + spin_lock_irqsave(&cpu_q->requeue_lock, flags); + + if (likely(!test_and_set_bit_lock(0, &q->in_list))) { + if (WARN_ON(!list_empty(&q->requeue_list))) + goto unlock; + + need_set = !test_bit(cpu_q->cpu, sess->cpu_queues_bm); + if (need_set) { + set_bit(cpu_q->cpu, sess->cpu_queues_bm); + /* Paired with rnbd_put_permit(). Set a bit first + * and then observe the busy counter. + */ + smp_mb__before_atomic(); + } + if (likely(atomic_read(&sess->busy))) { + list_add_tail(&q->requeue_list, &cpu_q->requeue_list); + } else { + /* Very unlikely, but possible: busy counter was + * observed as zero. Drop all bits and return + * false to restart the queue by ourselves. + */ + if (need_set) + clear_bit(cpu_q->cpu, sess->cpu_queues_bm); + clear_bit_unlock(0, &q->in_list); + added = false; + } + } +unlock: + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + put_cpu_ptr(sess->cpu_queues); + + return added; +} + +static void rnbd_clt_dev_kick_mq_queue(struct rnbd_clt_dev *dev, + struct blk_mq_hw_ctx *hctx, + int delay) +{ + struct rnbd_queue *q = hctx->driver_data; + + if (delay != RNBD_DELAY_IFBUSY) + blk_mq_delay_run_hw_queue(hctx, delay); + else if (unlikely(!rnbd_clt_dev_add_to_requeue(dev, q))) + /* + * If session is not busy we have to restart + * the queue ourselves. + */ + blk_mq_delay_run_hw_queue(hctx, 10/*ms*/); +} + +static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); + int err; + + if (unlikely(dev->dev_state != DEV_STATE_MAPPED)) + return BLK_STS_IOERR; + + iu->permit = rnbd_get_permit(dev->sess, RTRS_IO_CON, + RTRS_PERMIT_NOWAIT); + if (unlikely(!iu->permit)) { + rnbd_clt_dev_kick_mq_queue(dev, hctx, RNBD_DELAY_IFBUSY); + return BLK_STS_RESOURCE; + } + + blk_mq_start_request(rq); + err = rnbd_client_xfer_request(dev, rq, iu); + if (likely(err == 0)) + return BLK_STS_OK; + if (unlikely(err == -EAGAIN || err == -ENOMEM)) { + rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/); + rnbd_put_permit(dev->sess, iu->permit); + return BLK_STS_RESOURCE; + } + + rnbd_put_permit(dev->sess, iu->permit); + return BLK_STS_IOERR; +} + +static int rnbd_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); + + sg_init_table(iu->sglist, BMAX_SEGMENTS); + return 0; +} + +static struct blk_mq_ops rnbd_mq_ops = { + .queue_rq = rnbd_queue_rq, + .init_request = rnbd_init_request, + .complete = rnbd_softirq_done_fn, +}; + +static int setup_mq_tags(struct rnbd_clt_session *sess) +{ + struct blk_mq_tag_set *tag_set = &sess->tag_set; + + memset(tag_set, 0, sizeof(*tag_set)); + tag_set->ops = &rnbd_mq_ops; + tag_set->queue_depth = sess->queue_depth; + tag_set->numa_node = NUMA_NO_NODE; + tag_set->flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_TAG_SHARED; + tag_set->cmd_size = sizeof(struct rnbd_iu); + tag_set->nr_hw_queues = num_online_cpus(); + + return blk_mq_alloc_tag_set(tag_set); +} + +static struct rnbd_clt_session * +find_and_get_or_create_sess(const char *sessname, + const struct rtrs_addr *paths, + size_t path_cnt, u16 port_nr) +{ + struct rnbd_clt_session *sess; + struct rtrs_attrs attrs; + int err; + bool first; + struct rtrs_clt_ops rtrs_ops; + + sess = find_or_create_sess(sessname, &first); + if (sess == ERR_PTR(-ENOMEM)) + return ERR_PTR(-ENOMEM); + else if (!first) + return sess; + + rtrs_ops = (struct rtrs_clt_ops) { + .priv = sess, + .link_ev = rnbd_clt_link_ev, + }; + /* + * Nothing was found, establish rtrs connection and proceed further. + */ + sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname, + paths, path_cnt, port_nr, + sizeof(struct rnbd_iu), + RECONNECT_DELAY, BMAX_SEGMENTS, + BLK_MAX_SEGMENT_SIZE, + MAX_RECONNECTS); + if (IS_ERR(sess->rtrs)) { + err = PTR_ERR(sess->rtrs); + goto wake_up_and_put; + } + rtrs_clt_query(sess->rtrs, &attrs); + sess->max_io_size = attrs.max_io_size; + sess->queue_depth = attrs.queue_depth; + + err = setup_mq_tags(sess); + if (err) + goto close_rtrs; + + err = send_msg_sess_info(sess, WAIT); + if (err) + goto close_rtrs; + + wake_up_rtrs_waiters(sess); + + return sess; + +close_rtrs: + close_rtrs(sess); +put_sess: + rnbd_clt_put_sess(sess); + + return ERR_PTR(err); + +wake_up_and_put: + wake_up_rtrs_waiters(sess); + goto put_sess; +} + +static inline void rnbd_init_hw_queue(struct rnbd_clt_dev *dev, + struct rnbd_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + INIT_LIST_HEAD(&q->requeue_list); + q->dev = dev; + q->hctx = hctx; +} + +static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) +{ + int i; + struct blk_mq_hw_ctx *hctx; + struct rnbd_queue *q; + + queue_for_each_hw_ctx(dev->queue, hctx, i) { + q = &dev->hw_queues[i]; + rnbd_init_hw_queue(dev, q, hctx); + hctx->driver_data = q; + } +} + +static int setup_mq_dev(struct rnbd_clt_dev *dev) +{ + dev->queue = blk_mq_init_queue(&dev->sess->tag_set); + if (IS_ERR(dev->queue)) { + rnbd_clt_err(dev, "Initializing multiqueue queue failed, err: %ld\n", + PTR_ERR(dev->queue)); + return PTR_ERR(dev->queue); + } + rnbd_init_mq_hw_queues(dev); + return 0; +} + +static void setup_request_queue(struct rnbd_clt_dev *dev) +{ + blk_queue_logical_block_size(dev->queue, dev->logical_block_size); + blk_queue_physical_block_size(dev->queue, dev->physical_block_size); + blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors); + blk_queue_max_write_same_sectors(dev->queue, + dev->max_write_same_sectors); + + /* + * we don't support discards to "discontiguous" segments + * in on request + */ + blk_queue_max_discard_segments(dev->queue, 1); + + blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors); + dev->queue->limits.discard_granularity = dev->discard_granularity; + dev->queue->limits.discard_alignment = dev->discard_alignment; + if (dev->max_discard_sectors) + blk_queue_flag_set(QUEUE_FLAG_DISCARD, dev->queue); + if (dev->secure_discard) + blk_queue_flag_set(QUEUE_FLAG_SECERASE, dev->queue); + + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); + blk_queue_max_segments(dev->queue, dev->max_segments); + blk_queue_io_opt(dev->queue, dev->sess->max_io_size); + blk_queue_virt_boundary(dev->queue, SZ_4K - 1); + blk_queue_write_cache(dev->queue, true, true); + dev->queue->queuedata = dev; +} + +static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) +{ + dev->gd->major = rnbd_client_major; + dev->gd->first_minor = idx << RNBD_PART_BITS; + dev->gd->fops = &rnbd_client_ops; + dev->gd->queue = dev->queue; + dev->gd->private_data = dev; + snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d", + idx); + pr_debug("disk_name=%s, capacity=%zu\n", + dev->gd->disk_name, + dev->nsectors * (dev->logical_block_size / SECTOR_SIZE) + ); + + set_capacity(dev->gd, dev->nsectors); + + if (dev->access_mode == RNBD_ACCESS_RO) { + dev->read_only = true; + set_disk_ro(dev->gd, true); + } else { + dev->read_only = false; + } + + if (!dev->rotational) + blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); +} + +static int rnbd_client_setup_device(struct rnbd_clt_session *sess, + struct rnbd_clt_dev *dev, int idx) +{ + int err; + + dev->size = dev->nsectors * dev->logical_block_size; + + err = setup_mq_dev(dev); + if (err) + return err; + + setup_request_queue(dev); + + dev->gd = alloc_disk_node(1 << RNBD_PART_BITS, NUMA_NO_NODE); + if (!dev->gd) { + blk_cleanup_queue(dev->queue); + return -ENOMEM; + } + + rnbd_clt_setup_gen_disk(dev, idx); + + return 0; +} + +static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, + enum rnbd_access_mode access_mode, + const char *pathname) +{ + struct rnbd_clt_dev *dev; + int ret; + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, NUMA_NO_NODE); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->hw_queues = kcalloc(nr_cpu_ids, sizeof(*dev->hw_queues), + GFP_KERNEL); + if (!dev->hw_queues) { + ret = -ENOMEM; + goto out_alloc; + } + + mutex_lock(&ida_lock); + ret = ida_simple_get(&index_ida, 0, 1 << (MINORBITS - RNBD_PART_BITS), + GFP_KERNEL); + mutex_unlock(&ida_lock); + if (ret < 0) { + pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n", + pathname, sess->sessname, ret); + goto out_queues; + } + dev->clt_device_id = ret; + dev->sess = sess; + dev->access_mode = access_mode; + strlcpy(dev->pathname, pathname, sizeof(dev->pathname)); + mutex_init(&dev->lock); + refcount_set(&dev->refcount, 1); + dev->dev_state = DEV_STATE_INIT; + + /* + * Here we called from sysfs entry, thus clt-sysfs is + * responsible that session will not disappear. + */ + WARN_ON(!rnbd_clt_get_sess(sess)); + + return dev; + +out_queues: + kfree(dev->hw_queues); +out_alloc: + kfree(dev); + return ERR_PTR(ret); +} + +static bool __exists_dev(const char *pathname) +{ + struct rnbd_clt_session *sess; + struct rnbd_clt_dev *dev; + bool found = false; + + list_for_each_entry(sess, &sess_list, list) { + mutex_lock(&sess->lock); + list_for_each_entry(dev, &sess->devs_list, list) { + if (!strncmp(dev->pathname, pathname, + sizeof(dev->pathname))) { + found = true; + break; + } + } + mutex_unlock(&sess->lock); + if (found) + break; + } + + return found; +} + +static bool exists_devpath(const char *pathname) +{ + bool found; + + mutex_lock(&sess_lock); + found = __exists_dev(pathname); + mutex_unlock(&sess_lock); + + return found; +} + +static bool insert_dev_if_not_exists_devpath(const char *pathname, + struct rnbd_clt_session *sess, + struct rnbd_clt_dev *dev) +{ + bool found; + + mutex_lock(&sess_lock); + found = __exists_dev(pathname); + if (!found) { + mutex_lock(&sess->lock); + list_add_tail(&dev->list, &sess->devs_list); + mutex_unlock(&sess->lock); + } + mutex_unlock(&sess_lock); + + return found; +} + +static void delete_dev(struct rnbd_clt_dev *dev) +{ + struct rnbd_clt_session *sess = dev->sess; + + mutex_lock(&sess->lock); + list_del(&dev->list); + mutex_unlock(&sess->lock); +} + +struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, + struct rtrs_addr *paths, + size_t path_cnt, u16 port_nr, + const char *pathname, + enum rnbd_access_mode access_mode) +{ + struct rnbd_clt_session *sess; + struct rnbd_clt_dev *dev; + int ret; + + if (exists_devpath(pathname)) + return ERR_PTR(-EEXIST); + + sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr); + if (IS_ERR(sess)) + return ERR_CAST(sess); + + dev = init_dev(sess, access_mode, pathname); + if (IS_ERR(dev)) { + pr_err("map_device: failed to map device '%s' from session %s, can't initialize device, err: %ld\n", + pathname, sess->sessname, PTR_ERR(dev)); + ret = PTR_ERR(dev); + goto put_sess; + } + if (insert_dev_if_not_exists_devpath(pathname, sess, dev)) { + ret = -EEXIST; + goto put_dev; + } + ret = send_msg_open(dev, WAIT); + if (ret) { + rnbd_clt_err(dev, + "map_device: failed, can't open remote device, err: %d\n", + ret); + goto del_dev; + } + mutex_lock(&dev->lock); + pr_debug("Opened remote device: session=%s, path='%s'\n", + sess->sessname, pathname); + ret = rnbd_client_setup_device(sess, dev, dev->clt_device_id); + if (ret) { + rnbd_clt_err(dev, + "map_device: Failed to configure device, err: %d\n", + ret); + mutex_unlock(&dev->lock); + goto del_dev; + } + + rnbd_clt_info(dev, + "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d)\n", + dev->gd->disk_name, dev->nsectors, + dev->logical_block_size, dev->physical_block_size, + dev->max_write_same_sectors, dev->max_discard_sectors, + dev->discard_granularity, dev->discard_alignment, + dev->secure_discard, dev->max_segments, + dev->max_hw_sectors, dev->rotational); + + mutex_unlock(&dev->lock); + + add_disk(dev->gd); + rnbd_clt_put_sess(sess); + + return dev; + +del_dev: + delete_dev(dev); +put_dev: + rnbd_clt_put_dev(dev); +put_sess: + rnbd_clt_put_sess(sess); + + return ERR_PTR(ret); +} + +static void destroy_gen_disk(struct rnbd_clt_dev *dev) +{ + del_gendisk(dev->gd); + blk_cleanup_queue(dev->queue); + put_disk(dev->gd); +} + +static void destroy_sysfs(struct rnbd_clt_dev *dev, + const struct attribute *sysfs_self) +{ + rnbd_clt_remove_dev_symlink(dev); + if (dev->kobj.state_initialized) { + if (sysfs_self) + /* To avoid deadlock firstly remove itself */ + sysfs_remove_file_self(&dev->kobj, sysfs_self); + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); + } +} + +int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, + const struct attribute *sysfs_self) +{ + struct rnbd_clt_session *sess = dev->sess; + int refcount, ret = 0; + bool was_mapped; + + mutex_lock(&dev->lock); + if (dev->dev_state == DEV_STATE_UNMAPPED) { + rnbd_clt_info(dev, "Device is already being unmapped\n"); + ret = -EALREADY; + goto err; + } + refcount = refcount_read(&dev->refcount); + if (!force && refcount > 1) { + rnbd_clt_err(dev, + "Closing device failed, device is in use, (%d device users)\n", + refcount - 1); + ret = -EBUSY; + goto err; + } + was_mapped = (dev->dev_state == DEV_STATE_MAPPED); + dev->dev_state = DEV_STATE_UNMAPPED; + mutex_unlock(&dev->lock); + + delete_dev(dev); + destroy_sysfs(dev, sysfs_self); + destroy_gen_disk(dev); + if (was_mapped && sess->rtrs) + send_msg_close(dev, dev->device_id, WAIT); + + rnbd_clt_info(dev, "Device is unmapped\n"); + + /* Likely last reference put */ + rnbd_clt_put_dev(dev); + + /* + * Here device and session can be vanished! + */ + + return 0; +err: + mutex_unlock(&dev->lock); + + return ret; +} + +int rnbd_clt_remap_device(struct rnbd_clt_dev *dev) +{ + int err; + + mutex_lock(&dev->lock); + if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) + err = 0; + else if (dev->dev_state == DEV_STATE_UNMAPPED) + err = -ENODEV; + else if (dev->dev_state == DEV_STATE_MAPPED) + err = -EALREADY; + else + err = -EBUSY; + mutex_unlock(&dev->lock); + if (!err) { + rnbd_clt_info(dev, "Remapping device.\n"); + err = send_msg_open(dev, WAIT); + if (err) + rnbd_clt_err(dev, "remap_device: %d\n", err); + } + + return err; +} + +static void unmap_device_work(struct work_struct *work) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(work, typeof(*dev), unmap_on_rmmod_work); + rnbd_clt_unmap_device(dev, true, NULL); +} + +static void rnbd_destroy_sessions(void) +{ + struct rnbd_clt_session *sess, *sn; + struct rnbd_clt_dev *dev, *tn; + + /* Firstly forbid access through sysfs interface */ + rnbd_clt_destroy_default_group(); + rnbd_clt_destroy_sysfs_files(); + + /* + * Here at this point there is no any concurrent access to sessions + * list and devices list: + * 1. New session or device can'be be created - session sysfs files + * are removed. + * 2. Device or session can't be removed - module reference is taken + * into account in unmap device sysfs callback. + * 3. No IO requests inflight - each file open of block_dev increases + * module reference in get_disk(). + * + * But still there can be user requests inflights, which are sent by + * asynchronous send_msg_*() functions, thus before unmapping devices + * RTRS session must be explicitly closed. + */ + + list_for_each_entry_safe(sess, sn, &sess_list, list) { + WARN_ON(!rnbd_clt_get_sess(sess)); + close_rtrs(sess); + list_for_each_entry_safe(dev, tn, &sess->devs_list, list) { + /* + * Here unmap happens in parallel for only one reason: + * blk_cleanup_queue() takes around half a second, so + * on huge amount of devices the whole module unload + * procedure takes minutes. + */ + INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work); + queue_work(system_long_wq, &dev->unmap_on_rmmod_work); + } + rnbd_clt_put_sess(sess); + } + /* Wait for all scheduled unmap works */ + flush_workqueue(system_long_wq); + WARN_ON(!list_empty(&sess_list)); +} + +static int __init rnbd_client_init(void) +{ + int err = 0; + + BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4); + BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36); + BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info_rsp) != 36); + BUILD_BUG_ON(sizeof(struct rnbd_msg_open) != 264); + BUILD_BUG_ON(sizeof(struct rnbd_msg_close) != 8); + BUILD_BUG_ON(sizeof(struct rnbd_msg_open_rsp) != 56); + rnbd_client_major = register_blkdev(rnbd_client_major, "rnbd"); + if (rnbd_client_major <= 0) { + pr_err("Failed to load module, block device registration failed\n"); + return -EBUSY; + } + + err = rnbd_clt_create_sysfs_files(); + if (err) { + pr_err("Failed to load module, creating sysfs device files failed, err: %d\n", + err); + unregister_blkdev(rnbd_client_major, "rnbd"); + } + + return err; +} + +static void __exit rnbd_client_exit(void) +{ + rnbd_destroy_sessions(); + unregister_blkdev(rnbd_client_major, "rnbd"); + ida_destroy(&index_ida); +} + +module_init(rnbd_client_init); +module_exit(rnbd_client_exit); diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h new file mode 100644 index 000000000000..ed33654aa486 --- /dev/null +++ b/drivers/block/rnbd/rnbd-clt.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#ifndef RNBD_CLT_H +#define RNBD_CLT_H + +#include <linux/wait.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/blk-mq.h> +#include <linux/refcount.h> + +#include <rtrs.h> +#include "rnbd-proto.h" +#include "rnbd-log.h" + +/* Max. number of segments per IO request, Mellanox Connect X ~ Connect X5, + * choose minimial 30 for all, minus 1 for internal protocol, so 29. + */ +#define BMAX_SEGMENTS 29 +/* time in seconds between reconnect tries, default to 30 s */ +#define RECONNECT_DELAY 30 +/* + * Number of times to reconnect on error before giving up, 0 for * disabled, + * -1 for forever + */ +#define MAX_RECONNECTS -1 + +enum rnbd_clt_dev_state { + DEV_STATE_INIT, + DEV_STATE_MAPPED, + DEV_STATE_MAPPED_DISCONNECTED, + DEV_STATE_UNMAPPED, +}; + +struct rnbd_iu_comp { + wait_queue_head_t wait; + int errno; +}; + +struct rnbd_iu { + union { + struct request *rq; /* for block io */ + void *buf; /* for user messages */ + }; + struct rtrs_permit *permit; + union { + /* use to send msg associated with a dev */ + struct rnbd_clt_dev *dev; + /* use to send msg associated with a sess */ + struct rnbd_clt_session *sess; + }; + struct scatterlist sglist[BMAX_SEGMENTS]; + struct work_struct work; + int errno; + struct rnbd_iu_comp comp; + atomic_t refcount; +}; + +struct rnbd_cpu_qlist { + struct list_head requeue_list; + spinlock_t requeue_lock; + unsigned int cpu; +}; + +struct rnbd_clt_session { + struct list_head list; + struct rtrs_clt *rtrs; + wait_queue_head_t rtrs_waitq; + bool rtrs_ready; + struct rnbd_cpu_qlist __percpu + *cpu_queues; + DECLARE_BITMAP(cpu_queues_bm, NR_CPUS); + int __percpu *cpu_rr; /* per-cpu var for CPU round-robin */ + atomic_t busy; + int queue_depth; + u32 max_io_size; + struct blk_mq_tag_set tag_set; + struct mutex lock; /* protects state and devs_list */ + struct list_head devs_list; /* list of struct rnbd_clt_dev */ + refcount_t refcount; + char sessname[NAME_MAX]; + u8 ver; /* protocol version */ +}; + +/** + * Submission queues. + */ +struct rnbd_queue { + struct list_head requeue_list; + unsigned long in_list; + struct rnbd_clt_dev *dev; + struct blk_mq_hw_ctx *hctx; +}; + +struct rnbd_clt_dev { + struct rnbd_clt_session *sess; + struct request_queue *queue; + struct rnbd_queue *hw_queues; + u32 device_id; + /* local Idr index - used to track minor number allocations. */ + u32 clt_device_id; + struct mutex lock; + enum rnbd_clt_dev_state dev_state; + char pathname[NAME_MAX]; + enum rnbd_access_mode access_mode; + bool read_only; + bool rotational; + u32 max_hw_sectors; + u32 max_write_same_sectors; + u32 max_discard_sectors; + u32 discard_granularity; + u32 discard_alignment; + u16 secure_discard; + u16 physical_block_size; + u16 logical_block_size; + u16 max_segments; + size_t nsectors; + u64 size; /* device size in bytes */ + struct list_head list; + struct gendisk *gd; + struct kobject kobj; + char blk_symlink_name[NAME_MAX]; + refcount_t refcount; + struct work_struct unmap_on_rmmod_work; +}; + +/* rnbd-clt.c */ + +struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, + struct rtrs_addr *paths, + size_t path_cnt, u16 port_nr, + const char *pathname, + enum rnbd_access_mode access_mode); +int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, + const struct attribute *sysfs_self); + +int rnbd_clt_remap_device(struct rnbd_clt_dev *dev); +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize); + +/* rnbd-clt-sysfs.c */ + +int rnbd_clt_create_sysfs_files(void); + +void rnbd_clt_destroy_sysfs_files(void); +void rnbd_clt_destroy_default_group(void); + +void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev); + +#endif /* RNBD_CLT_H */ diff --git a/drivers/block/rnbd/rnbd-common.c b/drivers/block/rnbd/rnbd-common.c new file mode 100644 index 000000000000..596c3f732403 --- /dev/null +++ b/drivers/block/rnbd/rnbd-common.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#include "rnbd-proto.h" + +const char *rnbd_access_mode_str(enum rnbd_access_mode mode) +{ + switch (mode) { + case RNBD_ACCESS_RO: + return "ro"; + case RNBD_ACCESS_RW: + return "rw"; + case RNBD_ACCESS_MIGRATION: + return "migration"; + default: + return "unknown"; + } +} diff --git a/drivers/block/rnbd/rnbd-log.h b/drivers/block/rnbd/rnbd-log.h new file mode 100644 index 000000000000..136e7d6c3451 --- /dev/null +++ b/drivers/block/rnbd/rnbd-log.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RNBD_LOG_H +#define RNBD_LOG_H + +#include "rnbd-clt.h" +#include "rnbd-srv.h" + +#define rnbd_clt_log(fn, dev, fmt, ...) ( \ + fn("<%s@%s> " fmt, (dev)->pathname, \ + (dev)->sess->sessname, \ + ##__VA_ARGS__)) +#define rnbd_srv_log(fn, dev, fmt, ...) ( \ + fn("<%s@%s>: " fmt, (dev)->pathname, \ + (dev)->sess->sessname, ##__VA_ARGS__)) + +#define rnbd_clt_err(dev, fmt, ...) \ + rnbd_clt_log(pr_err, dev, fmt, ##__VA_ARGS__) +#define rnbd_clt_err_rl(dev, fmt, ...) \ + rnbd_clt_log(pr_err_ratelimited, dev, fmt, ##__VA_ARGS__) +#define rnbd_clt_info(dev, fmt, ...) \ + rnbd_clt_log(pr_info, dev, fmt, ##__VA_ARGS__) +#define rnbd_clt_info_rl(dev, fmt, ...) \ + rnbd_clt_log(pr_info_ratelimited, dev, fmt, ##__VA_ARGS__) + +#define rnbd_srv_err(dev, fmt, ...) \ + rnbd_srv_log(pr_err, dev, fmt, ##__VA_ARGS__) +#define rnbd_srv_err_rl(dev, fmt, ...) \ + rnbd_srv_log(pr_err_ratelimited, dev, fmt, ##__VA_ARGS__) +#define rnbd_srv_info(dev, fmt, ...) \ + rnbd_srv_log(pr_info, dev, fmt, ##__VA_ARGS__) +#define rnbd_srv_info_rl(dev, fmt, ...) \ + rnbd_srv_log(pr_info_ratelimited, dev, fmt, ##__VA_ARGS__) + +#endif /* RNBD_LOG_H */ diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h new file mode 100644 index 000000000000..ca166241452c --- /dev/null +++ b/drivers/block/rnbd/rnbd-proto.h @@ -0,0 +1,303 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RNBD_PROTO_H +#define RNBD_PROTO_H + +#include <linux/types.h> +#include <linux/blkdev.h> +#include <linux/limits.h> +#include <linux/inet.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <rdma/ib.h> + +#define RNBD_PROTO_VER_MAJOR 2 +#define RNBD_PROTO_VER_MINOR 0 + +/* The default port number the RTRS server is listening on. */ +#define RTRS_PORT 1234 + +/** + * enum rnbd_msg_types - RNBD message types + * @RNBD_MSG_SESS_INFO: initial session info from client to server + * @RNBD_MSG_SESS_INFO_RSP: initial session info from server to client + * @RNBD_MSG_OPEN: open (map) device request + * @RNBD_MSG_OPEN_RSP: response to an @RNBD_MSG_OPEN + * @RNBD_MSG_IO: block IO request operation + * @RNBD_MSG_CLOSE: close (unmap) device request + */ +enum rnbd_msg_type { + RNBD_MSG_SESS_INFO, + RNBD_MSG_SESS_INFO_RSP, + RNBD_MSG_OPEN, + RNBD_MSG_OPEN_RSP, + RNBD_MSG_IO, + RNBD_MSG_CLOSE, +}; + +/** + * struct rnbd_msg_hdr - header of RNBD messages + * @type: Message type, valid values see: enum rnbd_msg_types + */ +struct rnbd_msg_hdr { + __le16 type; + __le16 __padding; +}; + +/** + * We allow to map RO many times and RW only once. We allow to map yet another + * time RW, if MIGRATION is provided (second RW export can be required for + * example for VM migration) + */ +enum rnbd_access_mode { + RNBD_ACCESS_RO, + RNBD_ACCESS_RW, + RNBD_ACCESS_MIGRATION, +}; + +/** + * struct rnbd_msg_sess_info - initial session info from client to server + * @hdr: message header + * @ver: RNBD protocol version + */ +struct rnbd_msg_sess_info { + struct rnbd_msg_hdr hdr; + u8 ver; + u8 reserved[31]; +}; + +/** + * struct rnbd_msg_sess_info_rsp - initial session info from server to client + * @hdr: message header + * @ver: RNBD protocol version + */ +struct rnbd_msg_sess_info_rsp { + struct rnbd_msg_hdr hdr; + u8 ver; + u8 reserved[31]; +}; + +/** + * struct rnbd_msg_open - request to open a remote device. + * @hdr: message header + * @access_mode: the mode to open remote device, valid values see: + * enum rnbd_access_mode + * @device_name: device path on remote side + */ +struct rnbd_msg_open { + struct rnbd_msg_hdr hdr; + u8 access_mode; + u8 resv1; + s8 dev_name[NAME_MAX]; + u8 reserved[3]; +}; + +/** + * struct rnbd_msg_close - request to close a remote device. + * @hdr: message header + * @device_id: device_id on server side to identify the device + */ +struct rnbd_msg_close { + struct rnbd_msg_hdr hdr; + __le32 device_id; +}; + +/** + * struct rnbd_msg_open_rsp - response message to RNBD_MSG_OPEN + * @hdr: message header + * @device_id: device_id on server side to identify the device + * @nsectors: number of sectors in the usual 512b unit + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * @max_write_same_sectors: max sectors for WRITE SAME in the 512b unit + * @max_discard_sectors: max. sectors that can be discarded at once in 512b + * unit. + * @discard_granularity: size of the internal discard allocation unit in bytes + * @discard_alignment: offset from internal allocation assignment in bytes + * @physical_block_size: physical block size device supports in bytes + * @logical_block_size: logical block size device supports in bytes + * @max_segments: max segments hardware support in one transfer + * @secure_discard: supports secure discard + * @rotation: is a rotational disc? + */ +struct rnbd_msg_open_rsp { + struct rnbd_msg_hdr hdr; + __le32 device_id; + __le64 nsectors; + __le32 max_hw_sectors; + __le32 max_write_same_sectors; + __le32 max_discard_sectors; + __le32 discard_granularity; + __le32 discard_alignment; + __le16 physical_block_size; + __le16 logical_block_size; + __le16 max_segments; + __le16 secure_discard; + u8 rotational; + u8 reserved[11]; +}; + +/** + * struct rnbd_msg_io - message for I/O read/write + * @hdr: message header + * @device_id: device_id on server side to find the right device + * @sector: bi_sector attribute from struct bio + * @rw: valid values are defined in enum rnbd_io_flags + * @bi_size: number of bytes for I/O read/write + * @prio: priority + */ +struct rnbd_msg_io { + struct rnbd_msg_hdr hdr; + __le32 device_id; + __le64 sector; + __le32 rw; + __le32 bi_size; + __le16 prio; +}; + +#define RNBD_OP_BITS 8 +#define RNBD_OP_MASK ((1 << RNBD_OP_BITS) - 1) + +/** + * enum rnbd_io_flags - RNBD request types from rq_flag_bits + * @RNBD_OP_READ: read sectors from the device + * @RNBD_OP_WRITE: write sectors to the device + * @RNBD_OP_FLUSH: flush the volatile write cache + * @RNBD_OP_DISCARD: discard sectors + * @RNBD_OP_SECURE_ERASE: securely erase sectors + * @RNBD_OP_WRITE_SAME: write the same sectors many times + + * @RNBD_F_SYNC: request is sync (sync write or read) + * @RNBD_F_FUA: forced unit access + */ +enum rnbd_io_flags { + + /* Operations */ + + RNBD_OP_READ = 0, + RNBD_OP_WRITE = 1, + RNBD_OP_FLUSH = 2, + RNBD_OP_DISCARD = 3, + RNBD_OP_SECURE_ERASE = 4, + RNBD_OP_WRITE_SAME = 5, + + RNBD_OP_LAST, + + /* Flags */ + + RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0), + RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1), + + RNBD_F_ALL = (RNBD_F_SYNC | RNBD_F_FUA) + +}; + +static inline u32 rnbd_op(u32 flags) +{ + return flags & RNBD_OP_MASK; +} + +static inline u32 rnbd_flags(u32 flags) +{ + return flags & ~RNBD_OP_MASK; +} + +static inline bool rnbd_flags_supported(u32 flags) +{ + u32 op; + + op = rnbd_op(flags); + flags = rnbd_flags(flags); + + if (op >= RNBD_OP_LAST) + return false; + if (flags & ~RNBD_F_ALL) + return false; + + return true; +} + +static inline u32 rnbd_to_bio_flags(u32 rnbd_opf) +{ + u32 bio_opf; + + switch (rnbd_op(rnbd_opf)) { + case RNBD_OP_READ: + bio_opf = REQ_OP_READ; + break; + case RNBD_OP_WRITE: + bio_opf = REQ_OP_WRITE; + break; + case RNBD_OP_FLUSH: + bio_opf = REQ_OP_FLUSH | REQ_PREFLUSH; + break; + case RNBD_OP_DISCARD: + bio_opf = REQ_OP_DISCARD; + break; + case RNBD_OP_SECURE_ERASE: + bio_opf = REQ_OP_SECURE_ERASE; + break; + case RNBD_OP_WRITE_SAME: + bio_opf = REQ_OP_WRITE_SAME; + break; + default: + WARN(1, "Unknown RNBD type: %d (flags %d)\n", + rnbd_op(rnbd_opf), rnbd_opf); + bio_opf = 0; + } + + if (rnbd_opf & RNBD_F_SYNC) + bio_opf |= REQ_SYNC; + + if (rnbd_opf & RNBD_F_FUA) + bio_opf |= REQ_FUA; + + return bio_opf; +} + +static inline u32 rq_to_rnbd_flags(struct request *rq) +{ + u32 rnbd_opf; + + switch (req_op(rq)) { + case REQ_OP_READ: + rnbd_opf = RNBD_OP_READ; + break; + case REQ_OP_WRITE: + rnbd_opf = RNBD_OP_WRITE; + break; + case REQ_OP_DISCARD: + rnbd_opf = RNBD_OP_DISCARD; + break; + case REQ_OP_SECURE_ERASE: + rnbd_opf = RNBD_OP_SECURE_ERASE; + break; + case REQ_OP_WRITE_SAME: + rnbd_opf = RNBD_OP_WRITE_SAME; + break; + case REQ_OP_FLUSH: + rnbd_opf = RNBD_OP_FLUSH; + break; + default: + WARN(1, "Unknown request type %d (flags %llu)\n", + req_op(rq), (unsigned long long)rq->cmd_flags); + rnbd_opf = 0; + } + + if (op_is_sync(rq->cmd_flags)) + rnbd_opf |= RNBD_F_SYNC; + + if (op_is_flush(rq->cmd_flags)) + rnbd_opf |= RNBD_F_FUA; + + return rnbd_opf; +} + +const char *rnbd_access_mode_str(enum rnbd_access_mode mode); + +#endif /* RNBD_PROTO_H */ diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c new file mode 100644 index 000000000000..5eddfd29ab64 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-dev.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rnbd-srv-dev.h" +#include "rnbd-log.h" + +struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, + struct bio_set *bs) +{ + struct rnbd_dev *dev; + int ret; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->blk_open_flags = flags; + dev->bdev = blkdev_get_by_path(path, flags, THIS_MODULE); + ret = PTR_ERR_OR_ZERO(dev->bdev); + if (ret) + goto err; + + dev->blk_open_flags = flags; + bdevname(dev->bdev, dev->name); + dev->ibd_bio_set = bs; + + return dev; + +err: + kfree(dev); + return ERR_PTR(ret); +} + +void rnbd_dev_close(struct rnbd_dev *dev) +{ + blkdev_put(dev->bdev, dev->blk_open_flags); + kfree(dev); +} + +static void rnbd_dev_bi_end_io(struct bio *bio) +{ + struct rnbd_dev_blk_io *io = bio->bi_private; + + rnbd_endio(io->priv, blk_status_to_errno(bio->bi_status)); + bio_put(bio); +} + +/** + * rnbd_bio_map_kern - map kernel address into bio + * @data: pointer to buffer to map + * @bs: bio_set to use. + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + int offset, i; + struct bio *bio; + + bio = bio_alloc_bioset(gfp_mask, nr_pages, bs); + if (!bio) + return ERR_PTR(-ENOMEM); + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (bio_add_page(bio, virt_to_page(data), bytes, + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_put; + return bio; +} + +int rnbd_dev_submit_io(struct rnbd_dev *dev, sector_t sector, void *data, + size_t len, u32 bi_size, enum rnbd_io_flags flags, + short prio, void *priv) +{ + struct rnbd_dev_blk_io *io; + struct bio *bio; + + /* Generate bio with pages pointing to the rdma buffer */ + bio = rnbd_bio_map_kern(data, dev->ibd_bio_set, len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + io = container_of(bio, struct rnbd_dev_blk_io, bio); + + io->dev = dev; + io->priv = priv; + + bio->bi_end_io = rnbd_dev_bi_end_io; + bio->bi_private = io; + bio->bi_opf = rnbd_to_bio_flags(flags); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = bi_size; + bio_set_prio(bio, prio); + bio_set_dev(bio, dev->bdev); + + submit_bio(bio); + + return 0; +} diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h new file mode 100644 index 000000000000..0f65b09a270e --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RNBD_SRV_DEV_H +#define RNBD_SRV_DEV_H + +#include <linux/fs.h> +#include "rnbd-proto.h" + +struct rnbd_dev { + struct block_device *bdev; + struct bio_set *ibd_bio_set; + fmode_t blk_open_flags; + char name[BDEVNAME_SIZE]; +}; + +struct rnbd_dev_blk_io { + struct rnbd_dev *dev; + void *priv; + /* have to be last member for front_pad usage of bioset_init */ + struct bio bio; +}; + +/** + * rnbd_dev_open() - Open a device + * @flags: open flags + * @bs: bio_set to use during block io, + */ +struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, + struct bio_set *bs); + +/** + * rnbd_dev_close() - Close a device + */ +void rnbd_dev_close(struct rnbd_dev *dev); + +void rnbd_endio(void *priv, int error); + +static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev) +{ + return queue_max_segments(bdev_get_queue(dev->bdev)); +} + +static inline int rnbd_dev_get_max_hw_sects(const struct rnbd_dev *dev) +{ + return queue_max_hw_sectors(bdev_get_queue(dev->bdev)); +} + +static inline int rnbd_dev_get_secure_discard(const struct rnbd_dev *dev) +{ + return blk_queue_secure_erase(bdev_get_queue(dev->bdev)); +} + +static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) +{ + if (!blk_queue_discard(bdev_get_queue(dev->bdev))) + return 0; + + return blk_queue_get_max_sectors(bdev_get_queue(dev->bdev), + REQ_OP_DISCARD); +} + +static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) +{ + return bdev_get_queue(dev->bdev)->limits.discard_granularity; +} + +static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) +{ + return bdev_get_queue(dev->bdev)->limits.discard_alignment; +} + +/** + * rnbd_dev_submit_io() - Submit an I/O to the disk + * @dev: device to that the I/O is submitted + * @sector: address to read/write data to + * @data: I/O data to write or buffer to read I/O date into + * @len: length of @data + * @bi_size: Amount of data that will be read/written + * @prio: IO priority + * @priv: private data passed to @io_fn + */ +int rnbd_dev_submit_io(struct rnbd_dev *dev, sector_t sector, void *data, + size_t len, u32 bi_size, enum rnbd_io_flags flags, + short prio, void *priv); + +#endif /* RNBD_SRV_DEV_H */ diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c new file mode 100644 index 000000000000..106775c074d1 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include <uapi/linux/limits.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/stat.h> +#include <linux/genhd.h> +#include <linux/list.h> +#include <linux/moduleparam.h> +#include <linux/device.h> + +#include "rnbd-srv.h" + +static struct device *rnbd_dev; +static struct class *rnbd_dev_class; +static struct kobject *rnbd_devs_kobj; + +static void rnbd_srv_dev_release(struct kobject *kobj) +{ + struct rnbd_srv_dev *dev; + + dev = container_of(kobj, struct rnbd_srv_dev, dev_kobj); + + kfree(dev); +} + +static struct kobj_type dev_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = rnbd_srv_dev_release +}; + +int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, + struct block_device *bdev, + const char *dev_name) +{ + struct kobject *bdev_kobj; + int ret; + + ret = kobject_init_and_add(&dev->dev_kobj, &dev_ktype, + rnbd_devs_kobj, dev_name); + if (ret) + return ret; + + dev->dev_sessions_kobj = kobject_create_and_add("sessions", + &dev->dev_kobj); + if (!dev->dev_sessions_kobj) + goto put_dev_kobj; + + bdev_kobj = &disk_to_dev(bdev->bd_disk)->kobj; + ret = sysfs_create_link(&dev->dev_kobj, bdev_kobj, "block_dev"); + if (ret) + goto put_sess_kobj; + + return 0; + +put_sess_kobj: + kobject_put(dev->dev_sessions_kobj); +put_dev_kobj: + kobject_put(&dev->dev_kobj); + return ret; +} + +void rnbd_srv_destroy_dev_sysfs(struct rnbd_srv_dev *dev) +{ + sysfs_remove_link(&dev->dev_kobj, "block_dev"); + kobject_del(dev->dev_sessions_kobj); + kobject_put(dev->dev_sessions_kobj); + kobject_del(&dev->dev_kobj); + kobject_put(&dev->dev_kobj); +} + +static ssize_t read_only_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%d\n", + !(sess_dev->open_flags & FMODE_WRITE)); +} + +static struct kobj_attribute rnbd_srv_dev_session_ro_attr = + __ATTR_RO(read_only); + +static ssize_t access_mode_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", + rnbd_access_mode_str(sess_dev->access_mode)); +} + +static struct kobj_attribute rnbd_srv_dev_session_access_mode_attr = + __ATTR_RO(access_mode); + +static ssize_t mapping_path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", sess_dev->pathname); +} + +static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = + __ATTR_RO(mapping_path); + +static struct attribute *rnbd_srv_default_dev_sessions_attrs[] = { + &rnbd_srv_dev_session_access_mode_attr.attr, + &rnbd_srv_dev_session_ro_attr.attr, + &rnbd_srv_dev_session_mapping_path_attr.attr, + NULL, +}; + +static struct attribute_group rnbd_srv_default_dev_session_attr_group = { + .attrs = rnbd_srv_default_dev_sessions_attrs, +}; + +void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev) +{ + sysfs_remove_group(&sess_dev->kobj, + &rnbd_srv_default_dev_session_attr_group); + + kobject_del(&sess_dev->kobj); + kobject_put(&sess_dev->kobj); +} + +static void rnbd_srv_sess_dev_release(struct kobject *kobj) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + rnbd_destroy_sess_dev(sess_dev); +} + +static struct kobj_type rnbd_srv_sess_dev_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = rnbd_srv_sess_dev_release, +}; + +int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev) +{ + int ret; + + ret = kobject_init_and_add(&sess_dev->kobj, &rnbd_srv_sess_dev_ktype, + sess_dev->dev->dev_sessions_kobj, "%s", + sess_dev->sess->sessname); + if (ret) + return ret; + + ret = sysfs_create_group(&sess_dev->kobj, + &rnbd_srv_default_dev_session_attr_group); + if (ret) + goto err; + + return 0; + +err: + kobject_put(&sess_dev->kobj); + + return ret; +} + +int rnbd_srv_create_sysfs_files(void) +{ + int err; + + rnbd_dev_class = class_create(THIS_MODULE, "rnbd-server"); + if (IS_ERR(rnbd_dev_class)) + return PTR_ERR(rnbd_dev_class); + + rnbd_dev = device_create(rnbd_dev_class, NULL, + MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(rnbd_dev)) { + err = PTR_ERR(rnbd_dev); + goto cls_destroy; + } + rnbd_devs_kobj = kobject_create_and_add("devices", &rnbd_dev->kobj); + if (!rnbd_devs_kobj) { + err = -ENOMEM; + goto dev_destroy; + } + + return 0; + +dev_destroy: + device_destroy(rnbd_dev_class, MKDEV(0, 0)); +cls_destroy: + class_destroy(rnbd_dev_class); + + return err; +} + +void rnbd_srv_destroy_sysfs_files(void) +{ + kobject_del(rnbd_devs_kobj); + kobject_put(rnbd_devs_kobj); + device_destroy(rnbd_dev_class, MKDEV(0, 0)); + class_destroy(rnbd_dev_class); +} diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c new file mode 100644 index 000000000000..86e61523907b --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv.c @@ -0,0 +1,844 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include <linux/module.h> +#include <linux/blkdev.h> + +#include "rnbd-srv.h" +#include "rnbd-srv-dev.h" + +MODULE_DESCRIPTION("RDMA Network Block Device Server"); +MODULE_LICENSE("GPL"); + +static u16 port_nr = RTRS_PORT; + +module_param_named(port_nr, port_nr, ushort, 0444); +MODULE_PARM_DESC(port_nr, + "The port number the server is listening on (default: " + __stringify(RTRS_PORT)")"); + +#define DEFAULT_DEV_SEARCH_PATH "/" + +static char dev_search_path[PATH_MAX] = DEFAULT_DEV_SEARCH_PATH; + +static int dev_search_path_set(const char *val, const struct kernel_param *kp) +{ + const char *p = strrchr(val, '\n') ? : val + strlen(val); + + if (strlen(val) >= sizeof(dev_search_path)) + return -EINVAL; + + snprintf(dev_search_path, sizeof(dev_search_path), "%.*s", + (int)(p - val), val); + + pr_info("dev_search_path changed to '%s'\n", dev_search_path); + + return 0; +} + +static struct kparam_string dev_search_path_kparam_str = { + .maxlen = sizeof(dev_search_path), + .string = dev_search_path +}; + +static const struct kernel_param_ops dev_search_path_ops = { + .set = dev_search_path_set, + .get = param_get_string, +}; + +module_param_cb(dev_search_path, &dev_search_path_ops, + &dev_search_path_kparam_str, 0444); +MODULE_PARM_DESC(dev_search_path, + "Sets the dev_search_path. When a device is mapped this path is prepended to the device path from the map device operation. If %SESSNAME% is specified in a path, then device will be searched in a session namespace. (default: " + DEFAULT_DEV_SEARCH_PATH ")"); + +static DEFINE_MUTEX(sess_lock); +static DEFINE_SPINLOCK(dev_lock); + +static LIST_HEAD(sess_list); +static LIST_HEAD(dev_list); + +struct rnbd_io_private { + struct rtrs_srv_op *id; + struct rnbd_srv_sess_dev *sess_dev; +}; + +static void rnbd_sess_dev_release(struct kref *kref) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kref, struct rnbd_srv_sess_dev, kref); + complete(sess_dev->destroy_comp); +} + +static inline void rnbd_put_sess_dev(struct rnbd_srv_sess_dev *sess_dev) +{ + kref_put(&sess_dev->kref, rnbd_sess_dev_release); +} + +void rnbd_endio(void *priv, int error) +{ + struct rnbd_io_private *rnbd_priv = priv; + struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; + + rnbd_put_sess_dev(sess_dev); + + rtrs_srv_resp_rdma(rnbd_priv->id, error); + + kfree(priv); +} + +static struct rnbd_srv_sess_dev * +rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) +{ + struct rnbd_srv_sess_dev *sess_dev; + int ret = 0; + + rcu_read_lock(); + sess_dev = xa_load(&srv_sess->index_idr, dev_id); + if (likely(sess_dev)) + ret = kref_get_unless_zero(&sess_dev->kref); + rcu_read_unlock(); + + if (!sess_dev || !ret) + return ERR_PTR(-ENXIO); + + return sess_dev; +} + +static int process_rdma(struct rtrs_srv *sess, + struct rnbd_srv_session *srv_sess, + struct rtrs_srv_op *id, void *data, u32 datalen, + const void *usr, size_t usrlen) +{ + const struct rnbd_msg_io *msg = usr; + struct rnbd_io_private *priv; + struct rnbd_srv_sess_dev *sess_dev; + u32 dev_id; + int err; + + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + dev_id = le32_to_cpu(msg->device_id); + + sess_dev = rnbd_get_sess_dev(dev_id, srv_sess); + if (IS_ERR(sess_dev)) { + pr_err_ratelimited("Got I/O request on session %s for unknown device id %d\n", + srv_sess->sessname, dev_id); + err = -ENOTCONN; + goto err; + } + + priv->sess_dev = sess_dev; + priv->id = id; + + err = rnbd_dev_submit_io(sess_dev->rnbd_dev, le64_to_cpu(msg->sector), + data, datalen, le32_to_cpu(msg->bi_size), + le32_to_cpu(msg->rw), + srv_sess->ver < RNBD_PROTO_VER_MAJOR || + usrlen < sizeof(*msg) ? + 0 : le16_to_cpu(msg->prio), priv); + if (unlikely(err)) { + rnbd_srv_err(sess_dev, "Submitting I/O to device failed, err: %d\n", + err); + goto sess_dev_put; + } + + return 0; + +sess_dev_put: + rnbd_put_sess_dev(sess_dev); +err: + kfree(priv); + return err; +} + +static void destroy_device(struct rnbd_srv_dev *dev) +{ + WARN_ONCE(!list_empty(&dev->sess_dev_list), + "Device %s is being destroyed but still in use!\n", + dev->id); + + spin_lock(&dev_lock); + list_del(&dev->list); + spin_unlock(&dev_lock); + + mutex_destroy(&dev->lock); + if (dev->dev_kobj.state_in_sysfs) + /* + * Destroy kobj only if it was really created. + */ + rnbd_srv_destroy_dev_sysfs(dev); + else + kfree(dev); +} + +static void destroy_device_cb(struct kref *kref) +{ + struct rnbd_srv_dev *dev; + + dev = container_of(kref, struct rnbd_srv_dev, kref); + + destroy_device(dev); +} + +static void rnbd_put_srv_dev(struct rnbd_srv_dev *dev) +{ + kref_put(&dev->kref, destroy_device_cb); +} + +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev) +{ + DECLARE_COMPLETION_ONSTACK(dc); + + xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id); + synchronize_rcu(); + sess_dev->destroy_comp = &dc; + rnbd_put_sess_dev(sess_dev); + wait_for_completion(&dc); /* wait for inflights to drop to zero */ + + rnbd_dev_close(sess_dev->rnbd_dev); + list_del(&sess_dev->sess_list); + mutex_lock(&sess_dev->dev->lock); + list_del(&sess_dev->dev_list); + if (sess_dev->open_flags & FMODE_WRITE) + sess_dev->dev->open_write_cnt--; + mutex_unlock(&sess_dev->dev->lock); + + rnbd_put_srv_dev(sess_dev->dev); + + rnbd_srv_info(sess_dev, "Device closed\n"); + kfree(sess_dev); +} + +static void destroy_sess(struct rnbd_srv_session *srv_sess) +{ + struct rnbd_srv_sess_dev *sess_dev, *tmp; + + if (list_empty(&srv_sess->sess_dev_list)) + goto out; + + mutex_lock(&srv_sess->lock); + list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list, + sess_list) + rnbd_srv_destroy_dev_session_sysfs(sess_dev); + mutex_unlock(&srv_sess->lock); + +out: + xa_destroy(&srv_sess->index_idr); + bioset_exit(&srv_sess->sess_bio_set); + + pr_info("RTRS Session %s disconnected\n", srv_sess->sessname); + + mutex_lock(&sess_lock); + list_del(&srv_sess->list); + mutex_unlock(&sess_lock); + + mutex_destroy(&srv_sess->lock); + kfree(srv_sess); +} + +static int create_sess(struct rtrs_srv *rtrs) +{ + struct rnbd_srv_session *srv_sess; + char sessname[NAME_MAX]; + int err; + + err = rtrs_srv_get_sess_name(rtrs, sessname, sizeof(sessname)); + if (err) { + pr_err("rtrs_srv_get_sess_name(%s): %d\n", sessname, err); + + return err; + } + srv_sess = kzalloc(sizeof(*srv_sess), GFP_KERNEL); + if (!srv_sess) + return -ENOMEM; + + srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs); + err = bioset_init(&srv_sess->sess_bio_set, srv_sess->queue_depth, + offsetof(struct rnbd_dev_blk_io, bio), + BIOSET_NEED_BVECS); + if (err) { + pr_err("Allocating srv_session for session %s failed\n", + sessname); + kfree(srv_sess); + return err; + } + + xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC); + INIT_LIST_HEAD(&srv_sess->sess_dev_list); + mutex_init(&srv_sess->lock); + mutex_lock(&sess_lock); + list_add(&srv_sess->list, &sess_list); + mutex_unlock(&sess_lock); + + srv_sess->rtrs = rtrs; + strlcpy(srv_sess->sessname, sessname, sizeof(srv_sess->sessname)); + + rtrs_srv_set_sess_priv(rtrs, srv_sess); + + return 0; +} + +static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, + enum rtrs_srv_link_ev ev, void *priv) +{ + struct rnbd_srv_session *srv_sess = priv; + + switch (ev) { + case RTRS_SRV_LINK_EV_CONNECTED: + return create_sess(rtrs); + + case RTRS_SRV_LINK_EV_DISCONNECTED: + if (WARN_ON_ONCE(!srv_sess)) + return -EINVAL; + + destroy_sess(srv_sess); + return 0; + + default: + pr_warn("Received unknown RTRS session event %d from session %s\n", + ev, srv_sess->sessname); + return -EINVAL; + } +} + +static int process_msg_close(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + void *data, size_t datalen, const void *usr, + size_t usrlen) +{ + const struct rnbd_msg_close *close_msg = usr; + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = rnbd_get_sess_dev(le32_to_cpu(close_msg->device_id), + srv_sess); + if (IS_ERR(sess_dev)) + return 0; + + rnbd_put_sess_dev(sess_dev); + mutex_lock(&srv_sess->lock); + rnbd_srv_destroy_dev_session_sysfs(sess_dev); + mutex_unlock(&srv_sess->lock); + return 0; +} + +static int process_msg_open(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + const void *msg, size_t len, + void *data, size_t datalen); + +static int process_msg_sess_info(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + const void *msg, size_t len, + void *data, size_t datalen); + +static int rnbd_srv_rdma_ev(struct rtrs_srv *rtrs, void *priv, + struct rtrs_srv_op *id, int dir, + void *data, size_t datalen, const void *usr, + size_t usrlen) +{ + struct rnbd_srv_session *srv_sess = priv; + const struct rnbd_msg_hdr *hdr = usr; + int ret = 0; + u16 type; + + if (WARN_ON_ONCE(!srv_sess)) + return -ENODEV; + + type = le16_to_cpu(hdr->type); + + switch (type) { + case RNBD_MSG_IO: + return process_rdma(rtrs, srv_sess, id, data, datalen, usr, + usrlen); + case RNBD_MSG_CLOSE: + ret = process_msg_close(rtrs, srv_sess, data, datalen, + usr, usrlen); + break; + case RNBD_MSG_OPEN: + ret = process_msg_open(rtrs, srv_sess, usr, usrlen, + data, datalen); + break; + case RNBD_MSG_SESS_INFO: + ret = process_msg_sess_info(rtrs, srv_sess, usr, usrlen, + data, datalen); + break; + default: + pr_warn("Received unexpected message type %d with dir %d from session %s\n", + type, dir, srv_sess->sessname); + return -EINVAL; + } + + rtrs_srv_resp_rdma(id, ret); + return 0; +} + +static struct rnbd_srv_sess_dev +*rnbd_sess_dev_alloc(struct rnbd_srv_session *srv_sess) +{ + struct rnbd_srv_sess_dev *sess_dev; + int error; + + sess_dev = kzalloc(sizeof(*sess_dev), GFP_KERNEL); + if (!sess_dev) + return ERR_PTR(-ENOMEM); + + error = xa_alloc(&srv_sess->index_idr, &sess_dev->device_id, sess_dev, + xa_limit_32b, GFP_NOWAIT); + if (error < 0) { + pr_warn("Allocating idr failed, err: %d\n", error); + kfree(sess_dev); + return ERR_PTR(error); + } + + return sess_dev; +} + +static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(const char *id) +{ + struct rnbd_srv_dev *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + strlcpy(dev->id, id, sizeof(dev->id)); + kref_init(&dev->kref); + INIT_LIST_HEAD(&dev->sess_dev_list); + mutex_init(&dev->lock); + + return dev; +} + +static struct rnbd_srv_dev * +rnbd_srv_find_or_add_srv_dev(struct rnbd_srv_dev *new_dev) +{ + struct rnbd_srv_dev *dev; + + spin_lock(&dev_lock); + list_for_each_entry(dev, &dev_list, list) { + if (!strncmp(dev->id, new_dev->id, sizeof(dev->id))) { + if (!kref_get_unless_zero(&dev->kref)) + /* + * We lost the race, device is almost dead. + * Continue traversing to find a valid one. + */ + continue; + spin_unlock(&dev_lock); + return dev; + } + } + list_add(&new_dev->list, &dev_list); + spin_unlock(&dev_lock); + + return new_dev; +} + +static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, + struct rnbd_srv_session *srv_sess, + enum rnbd_access_mode access_mode) +{ + int ret = -EPERM; + + mutex_lock(&srv_dev->lock); + + switch (access_mode) { + case RNBD_ACCESS_RO: + ret = 0; + break; + case RNBD_ACCESS_RW: + if (srv_dev->open_write_cnt == 0) { + srv_dev->open_write_cnt++; + ret = 0; + } else { + pr_err("Mapping device '%s' for session %s with RW permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", + srv_dev->id, srv_sess->sessname, + srv_dev->open_write_cnt, + rnbd_access_mode_str(access_mode)); + } + break; + case RNBD_ACCESS_MIGRATION: + if (srv_dev->open_write_cnt < 2) { + srv_dev->open_write_cnt++; + ret = 0; + } else { + pr_err("Mapping device '%s' for session %s with migration permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", + srv_dev->id, srv_sess->sessname, + srv_dev->open_write_cnt, + rnbd_access_mode_str(access_mode)); + } + break; + default: + pr_err("Received mapping request for device '%s' on session %s with invalid access mode: %d\n", + srv_dev->id, srv_sess->sessname, access_mode); + ret = -EINVAL; + } + + mutex_unlock(&srv_dev->lock); + + return ret; +} + +static struct rnbd_srv_dev * +rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, + struct rnbd_srv_session *srv_sess, + enum rnbd_access_mode access_mode) +{ + int ret; + struct rnbd_srv_dev *new_dev, *dev; + + new_dev = rnbd_srv_init_srv_dev(rnbd_dev->name); + if (IS_ERR(new_dev)) + return new_dev; + + dev = rnbd_srv_find_or_add_srv_dev(new_dev); + if (dev != new_dev) + kfree(new_dev); + + ret = rnbd_srv_check_update_open_perm(dev, srv_sess, access_mode); + if (ret) { + rnbd_put_srv_dev(dev); + return ERR_PTR(ret); + } + + return dev; +} + +static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, + struct rnbd_srv_sess_dev *sess_dev) +{ + struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev; + + rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); + rsp->device_id = + cpu_to_le32(sess_dev->device_id); + rsp->nsectors = + cpu_to_le64(get_capacity(rnbd_dev->bdev->bd_disk)); + rsp->logical_block_size = + cpu_to_le16(bdev_logical_block_size(rnbd_dev->bdev)); + rsp->physical_block_size = + cpu_to_le16(bdev_physical_block_size(rnbd_dev->bdev)); + rsp->max_segments = + cpu_to_le16(rnbd_dev_get_max_segs(rnbd_dev)); + rsp->max_hw_sectors = + cpu_to_le32(rnbd_dev_get_max_hw_sects(rnbd_dev)); + rsp->max_write_same_sectors = + cpu_to_le32(bdev_write_same(rnbd_dev->bdev)); + rsp->max_discard_sectors = + cpu_to_le32(rnbd_dev_get_max_discard_sects(rnbd_dev)); + rsp->discard_granularity = + cpu_to_le32(rnbd_dev_get_discard_granularity(rnbd_dev)); + rsp->discard_alignment = + cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); + rsp->secure_discard = + cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); + rsp->rotational = + !blk_queue_nonrot(bdev_get_queue(rnbd_dev->bdev)); +} + +static struct rnbd_srv_sess_dev * +rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, + const struct rnbd_msg_open *open_msg, + struct rnbd_dev *rnbd_dev, fmode_t open_flags, + struct rnbd_srv_dev *srv_dev) +{ + struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); + + if (IS_ERR(sdev)) + return sdev; + + kref_init(&sdev->kref); + + strlcpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); + + sdev->rnbd_dev = rnbd_dev; + sdev->sess = srv_sess; + sdev->dev = srv_dev; + sdev->open_flags = open_flags; + sdev->access_mode = open_msg->access_mode; + + return sdev; +} + +static char *rnbd_srv_get_full_path(struct rnbd_srv_session *srv_sess, + const char *dev_name) +{ + char *full_path; + char *a, *b; + + full_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!full_path) + return ERR_PTR(-ENOMEM); + + /* + * Replace %SESSNAME% with a real session name in order to + * create device namespace. + */ + a = strnstr(dev_search_path, "%SESSNAME%", sizeof(dev_search_path)); + if (a) { + int len = a - dev_search_path; + + len = snprintf(full_path, PATH_MAX, "%.*s/%s/%s", len, + dev_search_path, srv_sess->sessname, dev_name); + if (len >= PATH_MAX) { + pr_err("Too long path: %s, %s, %s\n", + dev_search_path, srv_sess->sessname, dev_name); + kfree(full_path); + return ERR_PTR(-EINVAL); + } + } else { + snprintf(full_path, PATH_MAX, "%s/%s", + dev_search_path, dev_name); + } + + /* eliminitate duplicated slashes */ + a = strchr(full_path, '/'); + b = a; + while (*b != '\0') { + if (*b == '/' && *a == '/') { + b++; + } else { + a++; + *a = *b; + b++; + } + } + a++; + *a = '\0'; + + return full_path; +} + +static int process_msg_sess_info(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + const void *msg, size_t len, + void *data, size_t datalen) +{ + const struct rnbd_msg_sess_info *sess_info_msg = msg; + struct rnbd_msg_sess_info_rsp *rsp = data; + + srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); + pr_debug("Session %s using protocol version %d (client version: %d, server version: %d)\n", + srv_sess->sessname, srv_sess->ver, + sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); + + rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); + rsp->ver = srv_sess->ver; + + return 0; +} + +/** + * find_srv_sess_dev() - a dev is already opened by this name + * @srv_sess: the session to search. + * @dev_name: string containing the name of the device. + * + * Return struct rnbd_srv_sess_dev if srv_sess already opened the dev_name + * NULL if the session didn't open the device yet. + */ +static struct rnbd_srv_sess_dev * +find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name) +{ + struct rnbd_srv_sess_dev *sess_dev; + + if (list_empty(&srv_sess->sess_dev_list)) + return NULL; + + list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list) + if (!strcmp(sess_dev->pathname, dev_name)) + return sess_dev; + + return NULL; +} + +static int process_msg_open(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + const void *msg, size_t len, + void *data, size_t datalen) +{ + int ret; + struct rnbd_srv_dev *srv_dev; + struct rnbd_srv_sess_dev *srv_sess_dev; + const struct rnbd_msg_open *open_msg = msg; + fmode_t open_flags; + char *full_path; + struct rnbd_dev *rnbd_dev; + struct rnbd_msg_open_rsp *rsp = data; + + pr_debug("Open message received: session='%s' path='%s' access_mode=%d\n", + srv_sess->sessname, open_msg->dev_name, + open_msg->access_mode); + open_flags = FMODE_READ; + if (open_msg->access_mode != RNBD_ACCESS_RO) + open_flags |= FMODE_WRITE; + + mutex_lock(&srv_sess->lock); + + srv_sess_dev = find_srv_sess_dev(srv_sess, open_msg->dev_name); + if (srv_sess_dev) + goto fill_response; + + if ((strlen(dev_search_path) + strlen(open_msg->dev_name)) + >= PATH_MAX) { + pr_err("Opening device for session %s failed, device path too long. '%s/%s' is longer than PATH_MAX (%d)\n", + srv_sess->sessname, dev_search_path, open_msg->dev_name, + PATH_MAX); + ret = -EINVAL; + goto reject; + } + if (strstr(open_msg->dev_name, "..")) { + pr_err("Opening device for session %s failed, device path %s contains relative path ..\n", + srv_sess->sessname, open_msg->dev_name); + ret = -EINVAL; + goto reject; + } + full_path = rnbd_srv_get_full_path(srv_sess, open_msg->dev_name); + if (IS_ERR(full_path)) { + ret = PTR_ERR(full_path); + pr_err("Opening device '%s' for client %s failed, failed to get device full path, err: %d\n", + open_msg->dev_name, srv_sess->sessname, ret); + goto reject; + } + + rnbd_dev = rnbd_dev_open(full_path, open_flags, + &srv_sess->sess_bio_set); + if (IS_ERR(rnbd_dev)) { + pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n", + full_path, srv_sess->sessname, PTR_ERR(rnbd_dev)); + ret = PTR_ERR(rnbd_dev); + goto free_path; + } + + srv_dev = rnbd_srv_get_or_create_srv_dev(rnbd_dev, srv_sess, + open_msg->access_mode); + if (IS_ERR(srv_dev)) { + pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n", + full_path, srv_sess->sessname, PTR_ERR(srv_dev)); + ret = PTR_ERR(srv_dev); + goto rnbd_dev_close; + } + + srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, + rnbd_dev, open_flags, + srv_dev); + if (IS_ERR(srv_sess_dev)) { + pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n", + full_path, srv_sess->sessname, PTR_ERR(srv_sess_dev)); + ret = PTR_ERR(srv_sess_dev); + goto srv_dev_put; + } + + /* Create the srv_dev sysfs files if they haven't been created yet. The + * reason to delay the creation is not to create the sysfs files before + * we are sure the device can be opened. + */ + mutex_lock(&srv_dev->lock); + if (!srv_dev->dev_kobj.state_in_sysfs) { + ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev, + rnbd_dev->name); + if (ret) { + mutex_unlock(&srv_dev->lock); + rnbd_srv_err(srv_sess_dev, + "Opening device failed, failed to create device sysfs files, err: %d\n", + ret); + goto free_srv_sess_dev; + } + } + + ret = rnbd_srv_create_dev_session_sysfs(srv_sess_dev); + if (ret) { + mutex_unlock(&srv_dev->lock); + rnbd_srv_err(srv_sess_dev, + "Opening device failed, failed to create dev client sysfs files, err: %d\n", + ret); + goto free_srv_sess_dev; + } + + list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list); + mutex_unlock(&srv_dev->lock); + + list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list); + + rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id); + + kfree(full_path); + +fill_response: + rnbd_srv_fill_msg_open_rsp(rsp, srv_sess_dev); + mutex_unlock(&srv_sess->lock); + return 0; + +free_srv_sess_dev: + xa_erase(&srv_sess->index_idr, srv_sess_dev->device_id); + synchronize_rcu(); + kfree(srv_sess_dev); +srv_dev_put: + if (open_msg->access_mode != RNBD_ACCESS_RO) { + mutex_lock(&srv_dev->lock); + srv_dev->open_write_cnt--; + mutex_unlock(&srv_dev->lock); + } + rnbd_put_srv_dev(srv_dev); +rnbd_dev_close: + rnbd_dev_close(rnbd_dev); +free_path: + kfree(full_path); +reject: + mutex_unlock(&srv_sess->lock); + return ret; +} + +static struct rtrs_srv_ctx *rtrs_ctx; + +static struct rtrs_srv_ops rtrs_ops; +static int __init rnbd_srv_init_module(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4); + BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36); + BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info_rsp) != 36); + BUILD_BUG_ON(sizeof(struct rnbd_msg_open) != 264); + BUILD_BUG_ON(sizeof(struct rnbd_msg_close) != 8); + BUILD_BUG_ON(sizeof(struct rnbd_msg_open_rsp) != 56); + rtrs_ops = (struct rtrs_srv_ops) { + .rdma_ev = rnbd_srv_rdma_ev, + .link_ev = rnbd_srv_link_ev, + }; + rtrs_ctx = rtrs_srv_open(&rtrs_ops, port_nr); + if (IS_ERR(rtrs_ctx)) { + err = PTR_ERR(rtrs_ctx); + pr_err("rtrs_srv_open(), err: %d\n", err); + return err; + } + + err = rnbd_srv_create_sysfs_files(); + if (err) { + pr_err("rnbd_srv_create_sysfs_files(), err: %d\n", err); + rtrs_srv_close(rtrs_ctx); + return err; + } + + return 0; +} + +static void __exit rnbd_srv_cleanup_module(void) +{ + rtrs_srv_close(rtrs_ctx); + WARN_ON(!list_empty(&sess_list)); + rnbd_srv_destroy_sysfs_files(); +} + +module_init(rnbd_srv_init_module); +module_exit(rnbd_srv_cleanup_module); diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h new file mode 100644 index 000000000000..5a8544b5e74f --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RNBD_SRV_H +#define RNBD_SRV_H + +#include <linux/types.h> +#include <linux/idr.h> +#include <linux/kref.h> + +#include <rtrs.h> +#include "rnbd-proto.h" +#include "rnbd-log.h" + +struct rnbd_srv_session { + /* Entry inside global sess_list */ + struct list_head list; + struct rtrs_srv *rtrs; + char sessname[NAME_MAX]; + int queue_depth; + struct bio_set sess_bio_set; + + struct xarray index_idr; + /* List of struct rnbd_srv_sess_dev */ + struct list_head sess_dev_list; + struct mutex lock; + u8 ver; +}; + +struct rnbd_srv_dev { + /* Entry inside global dev_list */ + struct list_head list; + struct kobject dev_kobj; + struct kobject *dev_sessions_kobj; + struct kref kref; + char id[NAME_MAX]; + /* List of rnbd_srv_sess_dev structs */ + struct list_head sess_dev_list; + struct mutex lock; + int open_write_cnt; +}; + +/* Structure which binds N devices and N sessions */ +struct rnbd_srv_sess_dev { + /* Entry inside rnbd_srv_dev struct */ + struct list_head dev_list; + /* Entry inside rnbd_srv_session struct */ + struct list_head sess_list; + struct rnbd_dev *rnbd_dev; + struct rnbd_srv_session *sess; + struct rnbd_srv_dev *dev; + struct kobject kobj; + u32 device_id; + fmode_t open_flags; + struct kref kref; + struct completion *destroy_comp; + char pathname[NAME_MAX]; + enum rnbd_access_mode access_mode; +}; + +/* rnbd-srv-sysfs.c */ + +int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, + struct block_device *bdev, + const char *dir_name); +void rnbd_srv_destroy_dev_sysfs(struct rnbd_srv_dev *dev); +int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); +void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); +int rnbd_srv_create_sysfs_files(void); +void rnbd_srv_destroy_sysfs_files(void); +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev); + +#endif /* RNBD_SRV_H */ diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 8ffa8260dcaf..3ba07ab30c84 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -96,20 +96,6 @@ static const struct block_device_operations rsxx_fops = { .ioctl = rsxx_blkdev_ioctl, }; -static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) -{ - generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio), - &card->gendisk->part0); -} - -static void disk_stats_complete(struct rsxx_cardinfo *card, - struct bio *bio, - unsigned long start_time) -{ - generic_end_io_acct(card->queue, bio_op(bio), - &card->gendisk->part0, start_time); -} - static void bio_dma_done_cb(struct rsxx_cardinfo *card, void *cb_data, unsigned int error) @@ -121,7 +107,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, if (atomic_dec_and_test(&meta->pending_dmas)) { if (!card->eeh_state && card->gendisk) - disk_stats_complete(card, meta->bio, meta->start_time); + bio_end_io_acct(meta->bio, meta->start_time); if (atomic_read(&meta->error)) bio_io_error(meta->bio); @@ -167,10 +153,9 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) bio_meta->bio = bio; atomic_set(&bio_meta->error, 0); atomic_set(&bio_meta->pending_dmas, 0); - bio_meta->start_time = jiffies; if (!unlikely(card->halt)) - disk_stats_start(card, bio); + bio_meta->start_time = bio_start_io_acct(bio); dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", bio_data_dir(bio) ? 'W' : 'R', bio_meta, diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 4c297f69171d..dd34504382e5 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -327,7 +327,7 @@ static inline void swim_motor(struct swim __iomem *base, swim_select(base, RELAX); if (swim_readbit(base, MOTOR_ON)) break; - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); } } else if (action == OFF) { @@ -346,7 +346,7 @@ static inline void swim_eject(struct swim __iomem *base) swim_select(base, RELAX); if (!swim_readbit(base, DISK_IN)) break; - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); } swim_select(base, RELAX); @@ -370,7 +370,7 @@ static inline int swim_step(struct swim __iomem *base) for (wait = 0; wait < HZ; wait++) { - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); swim_select(base, RELAX); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index d84e8a878df2..1e2aa5ae2796 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -784,7 +784,7 @@ static const struct block_device_operations mm_fops = { static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) { - int ret = -ENODEV; + int ret; struct cardinfo *card = &cards[num_cards]; unsigned char mem_present; unsigned char batt_status; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index f9b1e70f1b31..9d21bf0f155e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -15,6 +15,7 @@ #include <linux/blk-mq.h> #include <linux/blk-mq-virtio.h> #include <linux/numa.h> +#include <uapi/linux/virtio_ring.h> #define PART_BITS 4 #define VQ_NAME_LEN 16 @@ -32,6 +33,15 @@ struct virtio_blk_vq { } ____cacheline_aligned_in_smp; struct virtio_blk { + /* + * This mutex must be held by anything that may run after + * virtblk_remove() sets vblk->vdev to NULL. + * + * blk-mq, virtqueue processing, and sysfs attribute code paths are + * shut down before vblk->vdev is set to NULL and therefore do not need + * to hold this mutex. + */ + struct mutex vdev_mutex; struct virtio_device *vdev; /* The disk structure for the kernel. */ @@ -43,6 +53,13 @@ struct virtio_blk { /* Process context for config space updates */ struct work_struct config_work; + /* + * Tracks references from block_device_operations open/release and + * virtio_driver probe/remove so this object can be freed once no + * longer in use. + */ + refcount_t refs; + /* What host tells us, plus 2 for header & tailer. */ unsigned int sg_elems; @@ -294,10 +311,55 @@ out: return err; } +static void virtblk_get(struct virtio_blk *vblk) +{ + refcount_inc(&vblk->refs); +} + +static void virtblk_put(struct virtio_blk *vblk) +{ + if (refcount_dec_and_test(&vblk->refs)) { + ida_simple_remove(&vd_index_ida, vblk->index); + mutex_destroy(&vblk->vdev_mutex); + kfree(vblk); + } +} + +static int virtblk_open(struct block_device *bd, fmode_t mode) +{ + struct virtio_blk *vblk = bd->bd_disk->private_data; + int ret = 0; + + mutex_lock(&vblk->vdev_mutex); + + if (vblk->vdev) + virtblk_get(vblk); + else + ret = -ENXIO; + + mutex_unlock(&vblk->vdev_mutex); + return ret; +} + +static void virtblk_release(struct gendisk *disk, fmode_t mode) +{ + struct virtio_blk *vblk = disk->private_data; + + virtblk_put(vblk); +} + /* We provide getgeo only to please some old bootloader/partitioning tools */ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) { struct virtio_blk *vblk = bd->bd_disk->private_data; + int ret = 0; + + mutex_lock(&vblk->vdev_mutex); + + if (!vblk->vdev) { + ret = -ENXIO; + goto out; + } /* see if the host passed in geometry config */ if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) { @@ -313,11 +375,15 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) geo->sectors = 1 << 5; geo->cylinders = get_capacity(bd->bd_disk) >> 11; } - return 0; +out: + mutex_unlock(&vblk->vdev_mutex); + return ret; } static const struct block_device_operations virtblk_fops = { .owner = THIS_MODULE, + .open = virtblk_open, + .release = virtblk_release, .getgeo = virtblk_getgeo, }; @@ -654,6 +720,10 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_free_index; } + /* This reference is dropped in virtblk_remove(). */ + refcount_set(&vblk->refs, 1); + mutex_init(&vblk->vdev_mutex); + vblk->vdev = vdev; vblk->sg_elems = sg_elems; @@ -819,8 +889,6 @@ out: static void virtblk_remove(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; - int index = vblk->index; - int refc; /* Make sure no work handler is accessing the device. */ flush_work(&vblk->config_work); @@ -830,18 +898,21 @@ static void virtblk_remove(struct virtio_device *vdev) blk_mq_free_tag_set(&vblk->tag_set); + mutex_lock(&vblk->vdev_mutex); + /* Stop all the virtqueues. */ vdev->config->reset(vdev); - refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref); + /* Virtqueues are stopped, nothing can use vblk->vdev anymore. */ + vblk->vdev = NULL; + put_disk(vblk->disk); vdev->config->del_vqs(vdev); kfree(vblk->vqs); - kfree(vblk); - /* Only free device id if we don't have any users */ - if (refc == 1) - ida_simple_remove(&vd_index_ida, index); + mutex_unlock(&vblk->vdev_mutex); + + virtblk_put(vblk); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 600430685e28..0e734802ee7c 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -35,10 +35,10 @@ #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/slab.h> +#include <linux/pgtable.h> #include <asm/setup.h> #include <asm/amigahw.h> -#include <asm/pgtable.h> #include <linux/zorro.h> diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 1a8564a79d8d..33e3b76c4fa9 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -29,7 +29,6 @@ static const char * const backends[] = { #if IS_ENABLED(CONFIG_CRYPTO_ZSTD) "zstd", #endif - NULL }; static void zcomp_strm_free(struct zcomp_strm *zstrm) @@ -37,19 +36,16 @@ static void zcomp_strm_free(struct zcomp_strm *zstrm) if (!IS_ERR_OR_NULL(zstrm->tfm)) crypto_free_comp(zstrm->tfm); free_pages((unsigned long)zstrm->buffer, 1); - kfree(zstrm); + zstrm->tfm = NULL; + zstrm->buffer = NULL; } /* - * allocate new zcomp_strm structure with ->tfm initialized by - * backend, return NULL on error + * Initialize zcomp_strm structure with ->tfm initialized by backend, and + * ->buffer. Return a negative value on error. */ -static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); - if (!zstrm) - return NULL; - zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0); /* * allocate 2 pages. 1 for compressed data, plus 1 extra for the @@ -58,16 +54,16 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { zcomp_strm_free(zstrm); - zstrm = NULL; + return -ENOMEM; } - return zstrm; + return 0; } bool zcomp_available_algorithm(const char *comp) { int i; - i = __sysfs_match_string(backends, -1, comp); + i = sysfs_match_string(backends, comp); if (i >= 0) return true; @@ -86,9 +82,9 @@ ssize_t zcomp_available_show(const char *comp, char *buf) { bool known_algorithm = false; ssize_t sz = 0; - int i = 0; + int i; - for (; backends[i]; i++) { + for (i = 0; i < ARRAY_SIZE(backends); i++) { if (!strcmp(comp, backends[i])) { known_algorithm = true; sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, @@ -113,12 +109,13 @@ ssize_t zcomp_available_show(const char *comp, char *buf) struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) { - return *get_cpu_ptr(comp->stream); + local_lock(&comp->stream->lock); + return this_cpu_ptr(comp->stream); } void zcomp_stream_put(struct zcomp *comp) { - put_cpu_ptr(comp->stream); + local_unlock(&comp->stream->lock); } int zcomp_compress(struct zcomp_strm *zstrm, @@ -159,17 +156,15 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) { struct zcomp *comp = hlist_entry(node, struct zcomp, node); struct zcomp_strm *zstrm; + int ret; - if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) - return 0; + zstrm = per_cpu_ptr(comp->stream, cpu); + local_lock_init(&zstrm->lock); - zstrm = zcomp_strm_alloc(comp); - if (IS_ERR_OR_NULL(zstrm)) { + ret = zcomp_strm_init(zstrm, comp); + if (ret) pr_err("Can't allocate a compression stream\n"); - return -ENOMEM; - } - *per_cpu_ptr(comp->stream, cpu) = zstrm; - return 0; + return ret; } int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node) @@ -177,10 +172,8 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node) struct zcomp *comp = hlist_entry(node, struct zcomp, node); struct zcomp_strm *zstrm; - zstrm = *per_cpu_ptr(comp->stream, cpu); - if (!IS_ERR_OR_NULL(zstrm)) - zcomp_strm_free(zstrm); - *per_cpu_ptr(comp->stream, cpu) = NULL; + zstrm = per_cpu_ptr(comp->stream, cpu); + zcomp_strm_free(zstrm); return 0; } @@ -188,7 +181,7 @@ static int zcomp_init(struct zcomp *comp) { int ret; - comp->stream = alloc_percpu(struct zcomp_strm *); + comp->stream = alloc_percpu(struct zcomp_strm); if (!comp->stream) return -ENOMEM; diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 1806475b919d..40f6420f4b2e 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -5,8 +5,11 @@ #ifndef _ZCOMP_H_ #define _ZCOMP_H_ +#include <linux/local_lock.h> struct zcomp_strm { + /* The members ->buffer and ->tfm are protected by ->lock. */ + local_lock_t lock; /* compression/decompression buffer */ void *buffer; struct crypto_comp *tfm; @@ -14,7 +17,7 @@ struct zcomp_strm { /* dynamic per-device compression frontend */ struct zcomp { - struct zcomp_strm * __percpu *stream; + struct zcomp_strm __percpu *stream; const char *name; struct hlist_node node; }; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ebb234f36909..6e2ad90b17a3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1510,13 +1510,8 @@ static void zram_bio_discard(struct zram *zram, u32 index, static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, unsigned int op, struct bio *bio) { - unsigned long start_time = jiffies; - struct request_queue *q = zram->disk->queue; int ret; - generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT, - &zram->disk->part0); - if (!op_is_write(op)) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset, bio); @@ -1526,8 +1521,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset, bio); } - generic_end_io_acct(q, op, &zram->disk->part0, start_time); - zram_slot_lock(zram, index); zram_accessed(zram, index); zram_slot_unlock(zram, index); @@ -1548,6 +1541,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) u32 index; struct bio_vec bvec; struct bvec_iter iter; + unsigned long start_time; index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; offset = (bio->bi_iter.bi_sector & @@ -1563,6 +1557,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) break; } + start_time = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { struct bio_vec bv = bvec; unsigned int unwritten = bvec.bv_len; @@ -1571,8 +1566,10 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, unwritten); if (zram_bvec_rw(zram, &bv, index, offset, - bio_op(bio), bio) < 0) - goto out; + bio_op(bio), bio) < 0) { + bio->bi_status = BLK_STS_IOERR; + break; + } bv.bv_offset += bv.bv_len; unwritten -= bv.bv_len; @@ -1580,12 +1577,8 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) update_position(&index, &offset, &bv); } while (unwritten); } - + bio_end_io_acct(bio, start_time); bio_endio(bio); - return; - -out: - bio_io_error(bio); } /* @@ -1633,6 +1626,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, u32 index; struct zram *zram; struct bio_vec bv; + unsigned long start_time; if (PageTransHuge(page)) return -ENOTSUPP; @@ -1651,7 +1645,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; + start_time = disk_start_io_acct(bdev->bd_disk, SECTORS_PER_PAGE, op); ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); + disk_end_io_acct(bdev->bd_disk, op, start_time); out: /* * If I/O fails, just return error(ie, non-zero) without |