 34 files changed, 479 insertions(+), 342 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index e17ebf70b548..3e5a5d263f29 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8096,6 +8096,16 @@ F: include/linux/iommu.h
 F: include/linux/of_iommu.h
 F: include/linux/iova.h
 
+IO_URING
+M: Jens Axboe <axboe@kernel.dk>
+L: linux-block@vger.kernel.org
+L: linux-fsdevel@vger.kernel.org
+T: git git://git.kernel.dk/linux-block
+T: git git://git.kernel.dk/liburing
+S: Maintained
+F: fs/io_uring.c
+F: include/uapi/linux/io_uring.h
+
 IP MASQUERADING
 M: Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S: Maintained
diff --git a/block/bio.c b/block/bio.c
index 71a78d9fb8b7..b64cedc7f87c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
     size = bio_add_page(bio, bv->bv_page, len,
                 bv->bv_offset + iter->iov_offset);
     if (size == len) {
-        struct page *page;
-        int i;
+        if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+            struct page *page;
+            int i;
+
+            mp_bvec_for_each_page(page, bv, i)
+                get_page(page);
+        }
 
-        /*
-         * For the normal O_DIRECT case, we could skip grabbing this
-         * reference and then not have to put them again when IO
-         * completes. But this breaks some in-kernel users, like
-         * splicing to/from a loop device, where we release the pipe
-         * pages unconditionally. If we can fix that case, we can
-         * get rid of the get here and the need to call
-         * bio_release_pages() at IO completion time.
-         */
-        mp_bvec_for_each_page(page, bv, i)
-            get_page(page);
         iov_iter_advance(iter, size);
         return 0;
     }
@@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  * This takes either an iterator pointing to user memory, or one pointing to
  * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  * map them into the kernel. On IO completion, the caller should put those
- * pages. For now, when adding kernel pages, we still grab a reference to the
- * page. This isn't strictly needed for the common case, but some call paths
- * end up releasing pages from eg a pipe and we can't easily control these.
- * See comment in __bio_iov_bvec_add_pages().
+ * pages. If we're adding kernel pages, and the caller told us it's safe to
+ * do so, we just have to add the pages to the bio directly. We don't grab an
+ * extra reference to those pages (the user should already have that), and we
+ * don't put the page on IO completion. The caller needs to check if the bio is
+ * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
+ * released.
  *
  * The function tries, but does not guarantee, to pin as many pages as
  * fit into the bio, or are requested in *iter, whatever is smaller. If
@@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
     const bool is_bvec = iov_iter_is_bvec(iter);
     unsigned short orig_vcnt = bio->bi_vcnt;
 
+    /*
+     * If this is a BVEC iter, then the pages are kernel pages. Don't
+     * release them on IO completion, if the caller asked us to.
+     */
+    if (is_bvec && iov_iter_bvec_no_ref(iter))
+        bio_set_flag(bio, BIO_NO_PAGE_REF);
+
     do {
         int ret;
@@ -1696,7 +1699,8 @@ static void bio_dirty_fn(struct work_struct *work)
         next = bio->bi_private;
 
         bio_set_pages_dirty(bio);
-        bio_release_pages(bio);
+        if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+            bio_release_pages(bio);
         bio_put(bio);
     }
 }
@@ -1713,7 +1717,8 @@ void bio_check_pages_dirty(struct bio *bio)
         goto defer;
     }
 
-    bio_release_pages(bio);
+    if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+        bio_release_pages(bio);
     bio_put(bio);
     return;
 defer:
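
Note on the bio.c hunks above: page references are now dropped only when the bio does not carry BIO_NO_PAGE_REF, so release is keyed to ownership recorded at setup time. A minimal userspace sketch of that ownership-flag pattern (all names invented for illustration; this is not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    #define BUF_NO_PAGE_REF (1u << 0)   /* stand-in for BIO_NO_PAGE_REF */

    struct buf {
        unsigned int flags;
        char *data;
    };

    static int buf_flagged(const struct buf *b, unsigned int f)
    {
        return (b->flags & f) != 0;
    }

    /* Completion path: release only what we own, like the
     * bio_check_pages_dirty()/bio_dirty_fn() changes above. */
    static void buf_complete(struct buf *b)
    {
        if (!buf_flagged(b, BUF_NO_PAGE_REF))
            free(b->data);  /* we took the reference, we drop it */
        b->data = NULL;     /* either way, forget the pointer */
    }

    int main(void)
    {
        char stack_page[64];
        struct buf owned = { .flags = 0, .data = malloc(64) };
        struct buf borrowed = { .flags = BUF_NO_PAGE_REF, .data = stack_page };

        buf_complete(&owned);       /* frees the heap buffer */
        buf_complete(&borrowed);    /* leaves the caller's buffer alone */
        puts("ok");
        return 0;
    }

The kernel analog is bio_set_flag(bio, BIO_NO_PAGE_REF) at setup and the bio_flagged() checks in the completion paths.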
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 77f37ef8ef06..617a2b3f7582 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1736,8 +1736,8 @@ out:
 
 /**
  * blkcg_schedule_throttle - this task needs to check for throttling
- * @q - the request queue IO was submitted on
- * @use_memdelay - do we charge this to memory delay for PSI
+ * @q: the request queue IO was submitted on
+ * @use_memdelay: do we charge this to memory delay for PSI
  *
  * This is called by the IO controller when we know there's delay accumulated
  * for the blkg for this task. We do not pass the blkg because there are places
@@ -1769,8 +1769,9 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
 
 /**
  * blkcg_add_delay - add delay to this blkg
- * @now - the current time in nanoseconds
- * @delta - how many nanoseconds of delay to add
+ * @blkg: blkg of interest
+ * @now: the current time in nanoseconds
+ * @delta: how many nanoseconds of delay to add
  *
  * Charge @delta to the blkg's current delay accumulation. This is used to
  * throttle tasks if an IO controller thinks we need more throttling.
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 2620baa1f699..507212d75ee2 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -75,6 +75,7 @@
 #include <linux/blk-mq.h>
 #include "blk-rq-qos.h"
 #include "blk-stat.h"
+#include "blk.h"
 
 #define DEFAULT_SCALE_COOKIE 1000000U
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a9c181603cbd..70b210a308c4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -782,7 +782,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
     if (kick_requeue_list)
         blk_mq_kick_requeue_list(q);
 }
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
@@ -1093,8 +1092,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
     bool ret;
 
     if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
-        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-            set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+        blk_mq_sched_mark_restart_hctx(hctx);
 
         /*
          * It's possible that a tag was freed in the window between the
@@ -2857,7 +2855,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
     /*
      * Default to classic polling
      */
-    q->poll_nsec = -1;
+    q->poll_nsec = BLK_MQ_POLL_CLASSIC;
 
     blk_mq_init_cpu_queues(q, set->nr_hw_queues);
     blk_mq_add_queue_tag_set(set, q);
@@ -3392,7 +3390,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
 {
     struct request *rq;
 
-    if (q->poll_nsec == -1)
+    if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
         return false;
 
     if (!blk_qc_t_is_internal(cookie))
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c11353a3749d..0ed8e5a8729f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -41,6 +41,8 @@ void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
+                bool kick_requeue_list);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_get_driver_tag(struct request *rq);
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 59685918167e..422327089e0f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -360,8 +360,8 @@ static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
 {
     int val;
 
-    if (q->poll_nsec == -1)
-        val = -1;
+    if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
+        val = BLK_MQ_POLL_CLASSIC;
     else
         val = q->poll_nsec / 1000;
 
@@ -380,10 +380,12 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
     if (err < 0)
         return err;
 
-    if (val == -1)
-        q->poll_nsec = -1;
-    else
+    if (val == BLK_MQ_POLL_CLASSIC)
+        q->poll_nsec = BLK_MQ_POLL_CLASSIC;
+    else if (val >= 0)
         q->poll_nsec = val * 1000;
+    else
+        return -EINVAL;
 
     return count;
 }
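
The poll_nsec changes above replace a bare -1 with the named sentinel BLK_MQ_POLL_CLASSIC, and the sysfs store path now rejects other negative values instead of scaling them by 1000. A hedged standalone sketch of that validation shape (invented names):

    #include <stdio.h>
    #include <errno.h>

    #define POLL_CLASSIC -1     /* stand-in for BLK_MQ_POLL_CLASSIC */

    /* Mirror of the new store logic: -1 selects classic polling, any
     * other negative value is an error rather than a bogus delay. */
    static int set_poll_nsec(long long *poll_nsec, long long val)
    {
        if (val == POLL_CLASSIC)
            *poll_nsec = POLL_CLASSIC;
        else if (val >= 0)
            *poll_nsec = val * 1000;    /* usec -> nsec */
        else
            return -EINVAL;
        return 0;
    }

    int main(void)
    {
        long long nsec;
        printf("%d\n", set_poll_nsec(&nsec, -1));   /* 0: classic */
        printf("%d\n", set_poll_nsec(&nsec, 500));  /* 0: 500000 ns */
        printf("%d\n", set_poll_nsec(&nsec, -5));   /* -EINVAL (-22 on Linux) */
        return 0;
    }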
diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig
index 57410f9c5d44..c52c738e554a 100644
--- a/drivers/auxdisplay/Kconfig
+++ b/drivers/auxdisplay/Kconfig
@@ -164,9 +164,7 @@ config ARM_CHARLCD
       line and the Linux version on the second line, but that's still
       useful.
 
-endif # AUXDISPLAY
-
-menuconfig PANEL
+menuconfig PARPORT_PANEL
     tristate "Parallel port LCD/Keypad Panel support"
     depends on PARPORT
     select CHARLCD
@@ -178,7 +176,7 @@ menuconfig PANEL
       compiled as a module, or linked into the kernel and started at boot.
       If you don't understand what all this is about, say N.
 
-if PANEL
+if PARPORT_PANEL
 
 config PANEL_PARPORT
     int "Default parallel port number (0=LPT1)"
@@ -419,8 +417,11 @@ config PANEL_LCD_PIN_BL
 
       Default for the 'BL' pin in custom profile is '0' (uncontrolled).
 
+endif # PARPORT_PANEL
+
 config PANEL_CHANGE_MESSAGE
     bool "Change LCD initialization message ?"
+    depends on CHARLCD
     default "n"
     ---help---
       This allows you to replace the boot message indicating the kernel version
@@ -444,7 +445,34 @@ config PANEL_BOOT_MESSAGE
       An empty message will only clear the display at driver init time. Any other
       printf()-formatted message is valid with newline and escape codes.
 
-endif # PANEL
+choice
+    prompt "Backlight initial state"
+    default CHARLCD_BL_FLASH
+
+    config CHARLCD_BL_OFF
+        bool "Off"
+        help
+          Backlight is initially turned off
+
+    config CHARLCD_BL_ON
+        bool "On"
+        help
+          Backlight is initially turned on
+
+    config CHARLCD_BL_FLASH
+        bool "Flash"
+        help
+          Backlight is flashed briefly on init
+
+endchoice
+
+endif # AUXDISPLAY
+
+config PANEL
+    tristate "Parallel port LCD/Keypad Panel support (OLD OPTION)"
+    depends on PARPORT
+    select AUXDISPLAY
+    select PARPORT_PANEL
 
 config CHARLCD
     tristate "Character LCD core support" if COMPILE_TEST
diff --git a/drivers/auxdisplay/Makefile b/drivers/auxdisplay/Makefile
index 7ac6776ca3f6..cf54b5efb07e 100644
--- a/drivers/auxdisplay/Makefile
+++ b/drivers/auxdisplay/Makefile
@@ -10,4 +10,4 @@ obj-$(CONFIG_CFAG12864B) += cfag12864b.o cfag12864bfb.o
 obj-$(CONFIG_IMG_ASCII_LCD) += img-ascii-lcd.o
 obj-$(CONFIG_HD44780)       += hd44780.o
 obj-$(CONFIG_HT16K33)       += ht16k33.o
-obj-$(CONFIG_PANEL)         += panel.o
+obj-$(CONFIG_PARPORT_PANEL) += panel.o
diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c
index 60e0b772673f..92745efefb54 100644
--- a/drivers/auxdisplay/charlcd.c
+++ b/drivers/auxdisplay/charlcd.c
@@ -91,7 +91,7 @@ struct charlcd_priv {
     unsigned long long drvdata[0];
 };
 
-#define to_priv(p) container_of(p, struct charlcd_priv, lcd)
+#define charlcd_to_priv(p) container_of(p, struct charlcd_priv, lcd)
 
 /* Device single-open policy control */
 static atomic_t charlcd_available = ATOMIC_INIT(1);
@@ -105,7 +105,7 @@ static void long_sleep(int ms)
 /* turn the backlight on or off */
 static void charlcd_backlight(struct charlcd *lcd, int on)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
 
     if (!lcd->ops->backlight)
         return;
@@ -134,7 +134,7 @@ static void charlcd_bl_off(struct work_struct *work)
 /* turn the backlight on for a little while */
 void charlcd_poke(struct charlcd *lcd)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
 
     if (!lcd->ops->backlight)
         return;
@@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(charlcd_poke);
 
 static void charlcd_gotoxy(struct charlcd *lcd)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
     unsigned int addr;
 
     /*
@@ -170,7 +170,7 @@ static void charlcd_gotoxy(struct charlcd *lcd)
 
 static void charlcd_home(struct charlcd *lcd)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
 
     priv->addr.x = 0;
     priv->addr.y = 0;
@@ -179,7 +179,7 @@ static void charlcd_home(struct charlcd *lcd)
 
 static void charlcd_print(struct charlcd *lcd, char c)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
 
     if (priv->addr.x < lcd->bwidth) {
         if (lcd->char_conv)
@@ -211,7 +211,7 @@ static void charlcd_clear_fast(struct charlcd *lcd)
 /* clears the display and resets X/Y */
 static void charlcd_clear_display(struct charlcd *lcd)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
 
     lcd->ops->write_cmd(lcd, LCD_CMD_DISPLAY_CLEAR);
     priv->addr.x = 0;
@@ -223,7 +223,7 @@ static void charlcd_clear_display(struct charlcd *lcd)
 static int charlcd_init_display(struct charlcd *lcd)
 {
     void (*write_cmd_raw)(struct charlcd *lcd, int cmd);
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
     u8 init;
 
     if (lcd->ifwidth != 4 && lcd->ifwidth != 8)
@@ -369,7 +369,7 @@ static bool parse_xy(const char *s, unsigned long *x, unsigned long *y)
 
 static inline int handle_lcd_special_code(struct charlcd *lcd)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
 
     /* LCD special codes */
@@ -580,7 +580,7 @@ static inline int handle_lcd_special_code(struct charlcd *lcd)
 
 static void charlcd_write_char(struct charlcd *lcd, char c)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
 
     /* first, we'll test if we're in escape mode */
     if ((c != '\n') && priv->esc_seq.len >= 0) {
@@ -705,7 +705,7 @@ static ssize_t charlcd_write(struct file *file, const char __user *buf,
 
 static int charlcd_open(struct inode *inode, struct file *file)
 {
-    struct charlcd_priv *priv = to_priv(the_charlcd);
+    struct charlcd_priv *priv = charlcd_to_priv(the_charlcd);
     int ret;
 
     ret = -EBUSY;
@@ -763,10 +763,24 @@ static void charlcd_puts(struct charlcd *lcd, const char *s)
     }
 }
 
+#ifdef CONFIG_PANEL_BOOT_MESSAGE
+#define LCD_INIT_TEXT CONFIG_PANEL_BOOT_MESSAGE
+#else
+#define LCD_INIT_TEXT "Linux-" UTS_RELEASE "\n"
+#endif
+
+#ifdef CONFIG_CHARLCD_BL_ON
+#define LCD_INIT_BL "\x1b[L+"
+#elif defined(CONFIG_CHARLCD_BL_FLASH)
+#define LCD_INIT_BL "\x1b[L*"
+#else
+#define LCD_INIT_BL "\x1b[L-"
+#endif
+
 /* initialize the LCD driver */
 static int charlcd_init(struct charlcd *lcd)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
     int ret;
 
     if (lcd->ops->backlight) {
@@ -784,13 +798,8 @@ static int charlcd_init(struct charlcd *lcd)
         return ret;
 
     /* display a short message */
-#ifdef CONFIG_PANEL_CHANGE_MESSAGE
-#ifdef CONFIG_PANEL_BOOT_MESSAGE
-    charlcd_puts(lcd, "\x1b[Lc\x1b[Lb\x1b[L*" CONFIG_PANEL_BOOT_MESSAGE);
-#endif
-#else
-    charlcd_puts(lcd, "\x1b[Lc\x1b[Lb\x1b[L*Linux-" UTS_RELEASE "\n");
-#endif
+    charlcd_puts(lcd, "\x1b[Lc\x1b[Lb" LCD_INIT_BL LCD_INIT_TEXT);
+
     /* clear the display on the next device opening */
     priv->must_clear = true;
     charlcd_home(lcd);
@@ -818,6 +827,12 @@ struct charlcd *charlcd_alloc(unsigned int drvdata_size)
 }
 EXPORT_SYMBOL_GPL(charlcd_alloc);
 
+void charlcd_free(struct charlcd *lcd)
+{
+    kfree(charlcd_to_priv(lcd));
+}
+EXPORT_SYMBOL_GPL(charlcd_free);
+
 static int panel_notify_sys(struct notifier_block *this, unsigned long code,
                 void *unused)
 {
@@ -866,7 +881,7 @@ EXPORT_SYMBOL_GPL(charlcd_register);
 
 int charlcd_unregister(struct charlcd *lcd)
 {
-    struct charlcd_priv *priv = to_priv(lcd);
+    struct charlcd_priv *priv = charlcd_to_priv(lcd);
 
     unregister_reboot_notifier(&panel_notifier);
     charlcd_puts(lcd, "\x0cLCD driver unloaded.\x1b[Lc\x1b[Lb\x1b[L-");
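
The charlcd changes above rename to_priv() to the less collision-prone charlcd_to_priv() and add charlcd_free(), so callers stop kfree()ing a pointer that is really embedded inside a larger private struct. A self-contained sketch of the container_of pattern behind this (invented names):

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct lcd { int width, height; };

    struct lcd_priv {
        unsigned long flags;    /* private state, invisible to drivers */
        struct lcd lcd;         /* public part handed out to drivers */
    };

    #define lcd_to_priv(p) container_of(p, struct lcd_priv, lcd)

    static struct lcd *lcd_alloc(void)
    {
        struct lcd_priv *priv = calloc(1, sizeof(*priv));
        return priv ? &priv->lcd : NULL;
    }

    /* Pairs with lcd_alloc(); free(lcd) would pass a mid-object pointer. */
    static void lcd_free(struct lcd *lcd)
    {
        free(lcd_to_priv(lcd));
    }

    int main(void)
    {
        struct lcd *lcd = lcd_alloc();
        lcd->width = 16;
        printf("width=%d\n", lcd->width);
        lcd_free(lcd);
        return 0;
    }

free(lcd) in this sketch would hand the allocator a pointer into the middle of the object, which is exactly the bug the hd44780 and panel hunks below fix by switching to the paired charlcd_free().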
diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c
index 9ad93ea42fdc..ab15b64707ad 100644
--- a/drivers/auxdisplay/hd44780.c
+++ b/drivers/auxdisplay/hd44780.c
@@ -271,7 +271,7 @@ static int hd44780_probe(struct platform_device *pdev)
     return 0;
 
 fail:
-    kfree(lcd);
+    charlcd_free(lcd);
     return ret;
 }
 
@@ -280,6 +280,8 @@ static int hd44780_remove(struct platform_device *pdev)
     struct charlcd *lcd = platform_get_drvdata(pdev);
 
     charlcd_unregister(lcd);
+
+    charlcd_free(lcd);
     return 0;
 }
diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c
index 21b9b2f2470a..e06de63497cf 100644
--- a/drivers/auxdisplay/panel.c
+++ b/drivers/auxdisplay/panel.c
@@ -1620,7 +1620,7 @@ err_lcd_unreg:
     if (lcd.enabled)
         charlcd_unregister(lcd.charlcd);
 err_unreg_device:
-    kfree(lcd.charlcd);
+    charlcd_free(lcd.charlcd);
     lcd.charlcd = NULL;
     parport_unregister_device(pprt);
     pprt = NULL;
@@ -1647,7 +1647,7 @@ static void panel_detach(struct parport *port)
     if (lcd.enabled) {
         charlcd_unregister(lcd.charlcd);
         lcd.initialized = false;
-        kfree(lcd.charlcd);
+        charlcd_free(lcd.charlcd);
         lcd.charlcd = NULL;
     }
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1e6edd568214..bf1c61cab8eb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -656,7 +656,7 @@ static int loop_validate_file(struct file *file, struct block_device *bdev)
         return -EBADF;
 
     l = f->f_mapping->host->i_bdev->bd_disk->private_data;
-    if (l->lo_state == Lo_unbound) {
+    if (l->lo_state != Lo_bound) {
         return -EINVAL;
     }
     f = l->lo_backing_file;
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 96670eefaeb2..377a694dc228 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -749,8 +749,12 @@ static int pcd_detect(void)
         return 0;
 
     printk("%s: No CD-ROM drive found\n", name);
-    for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
+    for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
+        blk_cleanup_queue(cd->disk->queue);
+        cd->disk->queue = NULL;
+        blk_mq_free_tag_set(&cd->tag_set);
         put_disk(cd->disk);
+    }
     pi_unregister_driver(par_drv);
     return -1;
 }
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index e92e7a8eeeb2..103b617cdc31 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -761,8 +761,12 @@ static int pf_detect(void)
         return 0;
 
     printk("%s: No ATAPI disk detected\n", name);
-    for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
+    for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
+        blk_cleanup_queue(pf->disk->queue);
+        pf->disk->queue = NULL;
+        blk_mq_free_tag_set(&pf->tag_set);
         put_disk(pf->disk);
+    }
     pi_unregister_driver(par_drv);
     return -1;
 }
@@ -1047,13 +1051,15 @@ static void __exit pf_exit(void)
     int unit;
     unregister_blkdev(major, name);
     for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
-        if (!pf->present)
-            continue;
-        del_gendisk(pf->disk);
+        if (pf->present)
+            del_gendisk(pf->disk);
+
+        blk_cleanup_queue(pf->disk->queue);
         blk_mq_free_tag_set(&pf->tag_set);
         put_disk(pf->disk);
-        pi_release(pf->pi);
+
+        if (pf->present)
+            pi_release(pf->pi);
     }
 }
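
A side note on the loop.c one-liner above: testing l->lo_state != Lo_bound also rejects transitional states (such as a device being torn down) that the old l->lo_state == Lo_unbound test would wave through. A tiny illustration with an invented three-state enum:

    #include <stdio.h>

    enum lo_state { LO_UNBOUND, LO_BOUND, LO_RUNDOWN };

    /* "!= LO_BOUND" rejects every state except the one we can use;
     * "== LO_UNBOUND" would incorrectly accept LO_RUNDOWN. */
    static int validate(enum lo_state s)
    {
        return s != LO_BOUND ? -1 : 0;
    }

    int main(void)
    {
        printf("%d %d %d\n", validate(LO_UNBOUND), validate(LO_BOUND),
               validate(LO_RUNDOWN));   /* prints: -1 0 -1 */
        return 0;
    }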
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4ba967d65cf9..2210c1b9491b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -833,7 +833,7 @@ static int parse_rbd_opts_token(char *c, void *private)
         pctx->opts->queue_depth = intval;
         break;
     case Opt_alloc_size:
-        if (intval < 1) {
+        if (intval < SECTOR_SIZE) {
             pr_err("alloc_size out of range\n");
             return -EINVAL;
         }
@@ -924,23 +924,6 @@ static void rbd_put_client(struct rbd_client *rbdc)
     kref_put(&rbdc->kref, rbd_client_release);
 }
 
-static int wait_for_latest_osdmap(struct ceph_client *client)
-{
-    u64 newest_epoch;
-    int ret;
-
-    ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
-    if (ret)
-        return ret;
-
-    if (client->osdc.osdmap->epoch >= newest_epoch)
-        return 0;
-
-    ceph_osdc_maybe_request_map(&client->osdc);
-    return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
-                     client->options->mount_timeout);
-}
-
 /*
  * Get a ceph client with specific addr and configuration, if one does
  * not exist create it. Either way, ceph_opts is consumed by this
@@ -960,7 +943,8 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
          * Using an existing client. Make sure ->pg_pools is up to
          * date before we look up the pool id in do_rbd_add().
          */
-        ret = wait_for_latest_osdmap(rbdc->client);
+        ret = ceph_wait_for_latest_osdmap(rbdc->client,
+                    rbdc->client->options->mount_timeout);
         if (ret) {
             rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
             rbd_put_client(rbdc);
@@ -4203,12 +4187,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
     q->limits.max_sectors = queue_max_hw_sectors(q);
     blk_queue_max_segments(q, USHRT_MAX);
     blk_queue_max_segment_size(q, UINT_MAX);
-    blk_queue_io_min(q, objset_bytes);
-    blk_queue_io_opt(q, objset_bytes);
+    blk_queue_io_min(q, rbd_dev->opts->alloc_size);
+    blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
 
     if (rbd_dev->opts->trim) {
         blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
-        q->limits.discard_granularity = objset_bytes;
+        q->limits.discard_granularity = rbd_dev->opts->alloc_size;
         blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
         blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
     }
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c
index 3c3cf89f713f..14bac4966c87 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_main.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_main.c
@@ -1801,6 +1801,12 @@ static int hisi_sas_I_T_nexus_reset(struct domain_device *device)
     }
     hisi_sas_dereg_device(hisi_hba, device);
 
+    if (dev_is_sata(device)) {
+        rc = hisi_sas_softreset_ata_disk(device);
+        if (rc)
+            return TMF_RESP_FUNC_FAILED;
+    }
+
     rc = hisi_sas_debug_I_T_nexus_reset(device);
 
     if ((rc == TMF_RESP_FUNC_COMPLETE) || (rc == -ENODEV))
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 1135e74646e2..8cec5230fe31 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -96,6 +96,7 @@ static int client_reserve = 1;
 static char partition_name[96] = "UNKNOWN";
 static unsigned int partition_number = -1;
 static LIST_HEAD(ibmvscsi_head);
+static DEFINE_SPINLOCK(ibmvscsi_driver_lock);
 
 static struct scsi_transport_template *ibmvscsi_transport_template;
 
@@ -2270,7 +2271,9 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id)
     }
 
     dev_set_drvdata(&vdev->dev, hostdata);
+    spin_lock(&ibmvscsi_driver_lock);
     list_add_tail(&hostdata->host_list, &ibmvscsi_head);
+    spin_unlock(&ibmvscsi_driver_lock);
     return 0;
 
       add_srp_port_failed:
@@ -2292,15 +2295,27 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 static int ibmvscsi_remove(struct vio_dev *vdev)
 {
     struct ibmvscsi_host_data *hostdata = dev_get_drvdata(&vdev->dev);
-    list_del(&hostdata->host_list);
-    unmap_persist_bufs(hostdata);
+    unsigned long flags;
+
+    srp_remove_host(hostdata->host);
+    scsi_remove_host(hostdata->host);
+
+    purge_requests(hostdata, DID_ERROR);
+
+    spin_lock_irqsave(hostdata->host->host_lock, flags);
     release_event_pool(&hostdata->pool, hostdata);
+    spin_unlock_irqrestore(hostdata->host->host_lock, flags);
+
     ibmvscsi_release_crq_queue(&hostdata->queue, hostdata,
                    max_events);
 
     kthread_stop(hostdata->work_thread);
-    srp_remove_host(hostdata->host);
-    scsi_remove_host(hostdata->host);
+    unmap_persist_bufs(hostdata);
+
+    spin_lock(&ibmvscsi_driver_lock);
+    list_del(&hostdata->host_list);
+    spin_unlock(&ibmvscsi_driver_lock);
+
     scsi_host_put(hostdata->host);
 
     return 0;
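
The ibmvscsi hunks above introduce ibmvscsi_driver_lock so every mutation of the global host list happens under a lock. The same discipline in portable C, with a pthreads mutex standing in for the kernel spinlock (invented names, singly linked list instead of list_head):

    #include <pthread.h>
    #include <stdio.h>

    struct node { struct node *next; int id; };

    static struct node *head;
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void add_host(struct node *n)
    {
        pthread_mutex_lock(&list_lock);
        n->next = head;
        head = n;
        pthread_mutex_unlock(&list_lock);
    }

    static void del_host(struct node *n)
    {
        pthread_mutex_lock(&list_lock);
        for (struct node **pp = &head; *pp; pp = &(*pp)->next) {
            if (*pp == n) {
                *pp = n->next;
                break;
            }
        }
        pthread_mutex_unlock(&list_lock);
    }

    int main(void)
    {
        struct node a = { .id = 1 }, b = { .id = 2 };
        add_host(&a);
        add_host(&b);
        del_host(&a);
        printf("head id=%d\n", head->id);   /* 2 */
        del_host(&b);
        return 0;
    }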
diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
index 420045155ba0..0c700b140ce7 100644
--- a/drivers/scsi/qla2xxx/qla_init.c
+++ b/drivers/scsi/qla2xxx/qla_init.c
@@ -4991,6 +4991,13 @@ qla2x00_configure_local_loop(scsi_qla_host_t *vha)
         if ((domain & 0xf0) == 0xf0)
             continue;
 
+        /* Bypass if not same domain and area of adapter. */
+        if (area && domain && ((area != vha->d_id.b.area) ||
+            (domain != vha->d_id.b.domain)) &&
+            (ha->current_topology == ISP_CFG_NL))
+            continue;
+
         /* Bypass invalid local loop ID. */
         if (loop_id > LAST_LOCAL_LOOP_ID)
             continue;
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 677f82fdf56f..91f576d743fe 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1517,7 +1517,7 @@ __qla2xxx_eh_generic_reset(char *name, enum nexus_wait_type type,
         goto eh_reset_failed;
     }
     err = 2;
-    if (do_reset(fcport, cmd->device->lun, blk_mq_rq_cpu(cmd->request) + 1)
+    if (do_reset(fcport, cmd->device->lun, 1)
         != QLA_SUCCESS) {
         ql_log(ql_log_warn, vha, 0x800c,
                "do_reset failed for cmd=%p.\n", cmd);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 20189675677a..601b9f1de267 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -585,10 +585,17 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
     if (!blk_rq_is_scsi(req)) {
         WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
         cmd->flags &= ~SCMD_INITIALIZED;
-        destroy_rcu_head(&cmd->rcu);
     }
 
     /*
+     * Calling rcu_barrier() is not necessary here because the
+     * SCSI error handler guarantees that the function called by
+     * call_rcu() has been called before scsi_end_request() is
+     * called.
+     */
+    destroy_rcu_head(&cmd->rcu);
+
+    /*
      * In the MQ case the command gets freed by __blk_mq_end_request,
      * so we have to do all cleanup that depends on it earlier.
      *
@@ -2541,8 +2548,10 @@ void scsi_device_resume(struct scsi_device *sdev)
      * device deleted during suspend)
      */
     mutex_lock(&sdev->state_mutex);
-    sdev->quiesced_by = NULL;
-    blk_clear_pm_only(sdev->request_queue);
+    if (sdev->quiesced_by) {
+        sdev->quiesced_by = NULL;
+        blk_clear_pm_only(sdev->request_queue);
+    }
     if (sdev->sdev_state == SDEV_QUIESCE)
         scsi_device_set_state(sdev, SDEV_RUNNING);
     mutex_unlock(&sdev->state_mutex);
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 0508831d6fb9..0a82e93566dc 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2200,6 +2200,8 @@ void iscsi_remove_session(struct iscsi_cls_session *session)
     scsi_target_unblock(&session->dev, SDEV_TRANSPORT_OFFLINE);
     /* flush running scans then delete devices */
     flush_work(&session->scan_work);
+    /* flush running unbind operations */
+    flush_work(&session->unbind_work);
     __iscsi_unbind_session(&session->unbind_work);
 
     /* hw iscsi may not have removed all connections from session */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e9faa52bb489..78d3257435c0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
         if (should_dirty) {
             bio_check_pages_dirty(bio);
         } else {
-            struct bio_vec *bvec;
-            int i;
-            struct bvec_iter_all iter_all;
+            if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+                struct bvec_iter_all iter_all;
+                struct bio_vec *bvec;
+                int i;
 
-            bio_for_each_segment_all(bvec, bio, i, iter_all)
-                put_page(bvec->bv_page);
+                bio_for_each_segment_all(bvec, bio, i, iter_all)
+                    put_page(bvec->bv_page);
+            }
             bio_put(bio);
         }
     }
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c88088d92613..6aaa30580a2b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -189,17 +189,28 @@ struct sqe_submit {
     bool needs_fixed_file;
 };
 
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
 struct io_poll_iocb {
     struct file *file;
     struct wait_queue_head *head;
     __poll_t events;
-    bool woken;
+    bool done;
     bool canceled;
     struct wait_queue_entry wait;
 };
 
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
 struct io_kiocb {
     union {
+        struct file *file;
         struct kiocb rw;
         struct io_poll_iocb poll;
     };
@@ -214,6 +225,7 @@ struct io_kiocb {
 #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
 #define REQ_F_FIXED_FILE       4 /* ctx owns file */
 #define REQ_F_SEQ_PREV         8 /* sequential with previous */
+#define REQ_F_PREPPED         16 /* prep already done */
     u64 user_data;
     u64 error;
@@ -355,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
     }
 }
 
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+    if (waitqueue_active(&ctx->wait))
+        wake_up(&ctx->wait);
+    if (waitqueue_active(&ctx->sqo_wait))
+        wake_up(&ctx->sqo_wait);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
                 long res, unsigned ev_flags)
 {
     unsigned long flags;
 
     spin_lock_irqsave(&ctx->completion_lock, flags);
-    io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+    io_cqring_fill_event(ctx, user_data, res, ev_flags);
     io_commit_cqring(ctx);
     spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-    if (waitqueue_active(&ctx->wait))
-        wake_up(&ctx->wait);
-    if (waitqueue_active(&ctx->sqo_wait))
-        wake_up(&ctx->sqo_wait);
+    io_cqring_ev_posted(ctx);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -382,13 +399,14 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
                    struct io_submit_state *state)
 {
+    gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
     struct io_kiocb *req;
 
     if (!percpu_ref_tryget(&ctx->refs))
         return NULL;
 
     if (!state) {
-        req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+        req = kmem_cache_alloc(req_cachep, gfp);
         if (unlikely(!req))
             goto out;
     } else if (!state->free_reqs) {
@@ -396,10 +414,18 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
         int ret;
 
         sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-        ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
-                        state->reqs);
-        if (unlikely(ret <= 0))
-            goto out;
+        ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+
+        /*
+         * Bulk alloc is all-or-nothing. If we fail to get a batch,
+         * retry single alloc to be on the safe side.
+         */
+        if (unlikely(ret <= 0)) {
+            state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+            if (!state->reqs[0])
+                goto out;
+            ret = 1;
+        }
         state->free_reqs = ret - 1;
         state->cur_req = 1;
         req = state->reqs[0];
@@ -411,7 +437,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 
     req->ctx = ctx;
     req->flags = 0;
-    refcount_set(&req->refs, 0);
+    /* one is dropped after submission, the other at completion */
+    refcount_set(&req->refs, 2);
     return req;
 out:
     io_ring_drop_ctx_refs(ctx, 1);
@@ -429,10 +456,16 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 
 static void io_free_req(struct io_kiocb *req)
 {
-    if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
-        io_ring_drop_ctx_refs(req->ctx, 1);
-        kmem_cache_free(req_cachep, req);
-    }
+    if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+        fput(req->file);
+    io_ring_drop_ctx_refs(req->ctx, 1);
+    kmem_cache_free(req_cachep, req);
+}
+
+static void io_put_req(struct io_kiocb *req)
+{
+    if (refcount_dec_and_test(&req->refs))
+        io_free_req(req);
 }
 
 /*
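
The hunks above give each request two references at allocation time: one owned by the submission path, one by the completion path, with io_put_req() dropping them. A toy single-threaded illustration of the lifetime (a real implementation would use an atomic refcount_t, which is what makes the scheme safe across the two paths):

    #include <stdio.h>

    struct req { int refs; };

    static void req_free(struct req *r)
    {
        (void)r;
        printf("freed\n");
    }

    static void req_put(struct req *r)
    {
        if (--r->refs == 0)     /* refcount_dec_and_test() analog */
            req_free(r);
    }

    int main(void)
    {
        struct req r = { .refs = 2 };   /* like refcount_set(&req->refs, 2) */

        req_put(&r);    /* submission path is done with it */
        req_put(&r);    /* completion fires: last ref, frees */
        return 0;
    }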
@@ -442,44 +475,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
                    struct list_head *done)
 {
     void *reqs[IO_IOPOLL_BATCH];
-    int file_count, to_free;
-    struct file *file = NULL;
     struct io_kiocb *req;
+    int to_free;
 
-    file_count = to_free = 0;
+    to_free = 0;
     while (!list_empty(done)) {
         req = list_first_entry(done, struct io_kiocb, list);
         list_del(&req->list);
 
         io_cqring_fill_event(ctx, req->user_data, req->error, 0);
-
-        reqs[to_free++] = req;
         (*nr_events)++;
 
-        /*
-         * Batched puts of the same file, to avoid dirtying the
-         * file usage count multiple times, if avoidable.
-         */
-        if (!(req->flags & REQ_F_FIXED_FILE)) {
-            if (!file) {
-                file = req->rw.ki_filp;
-                file_count = 1;
-            } else if (file == req->rw.ki_filp) {
-                file_count++;
+        if (refcount_dec_and_test(&req->refs)) {
+            /* If we're not using fixed files, we have to pair the
+             * completion part with the file put. Use regular
+             * completions for those, only batch free for fixed
+             * file.
+             */
+            if (req->flags & REQ_F_FIXED_FILE) {
+                reqs[to_free++] = req;
+                if (to_free == ARRAY_SIZE(reqs))
+                    io_free_req_many(ctx, reqs, &to_free);
             } else {
-                fput_many(file, file_count);
-                file = req->rw.ki_filp;
-                file_count = 1;
+                io_free_req(req);
             }
         }
-
-        if (to_free == ARRAY_SIZE(reqs))
-            io_free_req_many(ctx, reqs, &to_free);
     }
-    io_commit_cqring(ctx);
 
-    if (file)
-        fput_many(file, file_count);
+    io_commit_cqring(ctx);
     io_free_req_many(ctx, reqs, &to_free);
 }
@@ -602,21 +625,14 @@ static void kiocb_end_write(struct kiocb *kiocb)
     }
 }
 
-static void io_fput(struct io_kiocb *req)
-{
-    if (!(req->flags & REQ_F_FIXED_FILE))
-        fput(req->rw.ki_filp);
-}
-
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
     struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
     kiocb_end_write(kiocb);
 
-    io_fput(req);
     io_cqring_add_event(req->ctx, req->user_data, res, 0);
-    io_free_req(req);
+    io_put_req(req);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -731,31 +747,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
     const struct io_uring_sqe *sqe = s->sqe;
     struct io_ring_ctx *ctx = req->ctx;
     struct kiocb *kiocb = &req->rw;
-    unsigned ioprio, flags;
-    int fd, ret;
+    unsigned ioprio;
+    int ret;
 
+    if (!req->file)
+        return -EBADF;
     /* For -EAGAIN retry, everything is already prepped */
-    if (kiocb->ki_filp)
+    if (req->flags & REQ_F_PREPPED)
         return 0;
 
-    flags = READ_ONCE(sqe->flags);
-    fd = READ_ONCE(sqe->fd);
+    if (force_nonblock && !io_file_supports_async(req->file))
+        force_nonblock = false;
 
-    if (flags & IOSQE_FIXED_FILE) {
-        if (unlikely(!ctx->user_files ||
-            (unsigned) fd >= ctx->nr_user_files))
-            return -EBADF;
-        kiocb->ki_filp = ctx->user_files[fd];
-        req->flags |= REQ_F_FIXED_FILE;
-    } else {
-        if (s->needs_fixed_file)
-            return -EBADF;
-        kiocb->ki_filp = io_file_get(state, fd);
-        if (unlikely(!kiocb->ki_filp))
-            return -EBADF;
-        if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
-            force_nonblock = false;
-    }
     kiocb->ki_pos = READ_ONCE(sqe->off);
     kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
     kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -764,7 +767,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
     if (ioprio) {
         ret = ioprio_check_cap(ioprio);
         if (ret)
-            goto out_fput;
+            return ret;
 
         kiocb->ki_ioprio = ioprio;
     } else
@@ -772,38 +775,26 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 
     ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
     if (unlikely(ret))
-        goto out_fput;
+        return ret;
     if (force_nonblock) {
         kiocb->ki_flags |= IOCB_NOWAIT;
         req->flags |= REQ_F_FORCE_NONBLOCK;
     }
     if (ctx->flags & IORING_SETUP_IOPOLL) {
-        ret = -EOPNOTSUPP;
         if (!(kiocb->ki_flags & IOCB_DIRECT) ||
             !kiocb->ki_filp->f_op->iopoll)
-            goto out_fput;
+            return -EOPNOTSUPP;
 
         req->error = 0;
         kiocb->ki_flags |= IOCB_HIPRI;
         kiocb->ki_complete = io_complete_rw_iopoll;
     } else {
-        if (kiocb->ki_flags & IOCB_HIPRI) {
-            ret = -EINVAL;
-            goto out_fput;
-        }
+        if (kiocb->ki_flags & IOCB_HIPRI)
+            return -EINVAL;
         kiocb->ki_complete = io_complete_rw;
     }
+    req->flags |= REQ_F_PREPPED;
     return 0;
-out_fput:
-    if (!(flags & IOSQE_FIXED_FILE)) {
-        /*
-         * in case of error, we didn't use this file reference. drop it.
-         */
-        if (state)
-            state->used_refs--;
-        io_file_put(state, kiocb->ki_filp);
-    }
-    return ret;
 }
 
 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
@@ -864,6 +855,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
     iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
     if (offset)
         iov_iter_advance(iter, offset);
+
+    /* don't drop a reference to these pages */
+    iter->type |= ITER_BVEC_FLAG_NO_REF;
     return 0;
 }
 
@@ -887,7 +881,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
     opcode = READ_ONCE(sqe->opcode);
     if (opcode == IORING_OP_READ_FIXED ||
         opcode == IORING_OP_WRITE_FIXED) {
-        ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+        int ret = io_import_fixed(ctx, rw, sqe, iter);
         *iovec = NULL;
         return ret;
     }
@@ -945,31 +939,29 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
     async_list->io_end = io_end;
 }
 
-static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
-               bool force_nonblock, struct io_submit_state *state)
+static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+           bool force_nonblock, struct io_submit_state *state)
 {
     struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
     struct kiocb *kiocb = &req->rw;
     struct iov_iter iter;
     struct file *file;
     size_t iov_count;
-    ssize_t ret;
+    int ret;
 
     ret = io_prep_rw(req, s, force_nonblock, state);
     if (ret)
         return ret;
     file = kiocb->ki_filp;
 
-    ret = -EBADF;
     if (unlikely(!(file->f_mode & FMODE_READ)))
-        goto out_fput;
-    ret = -EINVAL;
+        return -EBADF;
     if (unlikely(!file->f_op->read_iter))
-        goto out_fput;
+        return -EINVAL;
 
     ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
     if (ret)
-        goto out_fput;
+        return ret;
 
     iov_count = iov_iter_count(&iter);
     ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -991,38 +983,32 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
         }
     }
     kfree(iovec);
-out_fput:
-    /* Hold on to the file for -EAGAIN */
-    if (unlikely(ret && ret != -EAGAIN))
-        io_fput(req);
     return ret;
 }
 
-static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
-            bool force_nonblock, struct io_submit_state *state)
+static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+            bool force_nonblock, struct io_submit_state *state)
 {
     struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
     struct kiocb *kiocb = &req->rw;
     struct iov_iter iter;
     struct file *file;
     size_t iov_count;
-    ssize_t ret;
+    int ret;
 
     ret = io_prep_rw(req, s, force_nonblock, state);
     if (ret)
         return ret;
 
-    ret = -EBADF;
     file = kiocb->ki_filp;
     if (unlikely(!(file->f_mode & FMODE_WRITE)))
-        goto out_fput;
-    ret = -EINVAL;
+        return -EBADF;
     if (unlikely(!file->f_op->write_iter))
-        goto out_fput;
+        return -EINVAL;
 
     ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
     if (ret)
-        goto out_fput;
+        return ret;
 
     iov_count = iov_iter_count(&iter);
 
@@ -1054,10 +1040,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
     }
 out_free:
     kfree(iovec);
-out_fput:
-    /* Hold on to the file for -EAGAIN */
-    if (unlikely(ret && ret != -EAGAIN))
-        io_fput(req);
     return ret;
 }
 
@@ -1072,29 +1054,19 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
     if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
         return -EINVAL;
 
-    /*
-     * Twilight zone - it's possible that someone issued an opcode that
-     * has a file attached, then got -EAGAIN on submission, and changed
-     * the sqe before we retried it from async context. Avoid dropping
-     * a file reference for this malicious case, and flag the error.
-     */
-    if (req->rw.ki_filp) {
-        err = -EBADF;
-        io_fput(req);
-    }
     io_cqring_add_event(ctx, user_data, err, 0);
-    io_free_req(req);
+    io_put_req(req);
     return 0;
 }
 
 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
     struct io_ring_ctx *ctx = req->ctx;
-    unsigned flags;
-    int fd;
 
-    /* Prep already done */
-    if (req->rw.ki_filp)
+    if (!req->file)
+        return -EBADF;
+    /* Prep already done (EAGAIN retry) */
+    if (req->flags & REQ_F_PREPPED)
         return 0;
 
     if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
@@ -1102,20 +1074,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
     if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
         return -EINVAL;
 
-    fd = READ_ONCE(sqe->fd);
-    flags = READ_ONCE(sqe->flags);
-
-    if (flags & IOSQE_FIXED_FILE) {
-        if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-            return -EBADF;
-        req->rw.ki_filp = ctx->user_files[fd];
-        req->flags |= REQ_F_FIXED_FILE;
-    } else {
-        req->rw.ki_filp = fget(fd);
-        if (unlikely(!req->rw.ki_filp))
-            return -EBADF;
-    }
-
+    req->flags |= REQ_F_PREPPED;
     return 0;
 }
 
@@ -1144,9 +1103,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                 end > 0 ? end : LLONG_MAX,
                 fsync_flags & IORING_FSYNC_DATASYNC);
 
-    io_fput(req);
     io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-    io_free_req(req);
+    io_put_req(req);
     return 0;
 }
 
@@ -1204,15 +1162,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
     spin_unlock_irq(&ctx->completion_lock);
 
     io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-    io_free_req(req);
+    io_put_req(req);
     return 0;
 }
 
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+                 __poll_t mask)
 {
-    io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
-    io_fput(req);
-    io_free_req(req);
+    req->poll.done = true;
+    io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+    io_commit_cqring(ctx);
 }
 
 static void io_poll_complete_work(struct work_struct *work)
@@ -1240,9 +1199,11 @@ static void io_poll_complete_work(struct work_struct *work)
         return;
     }
     list_del_init(&req->list);
+    io_poll_complete(ctx, req, mask);
     spin_unlock_irq(&ctx->completion_lock);
 
-    io_poll_complete(req, mask);
+    io_cqring_ev_posted(ctx);
+    io_put_req(req);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1253,29 +1214,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
     struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
     struct io_ring_ctx *ctx = req->ctx;
     __poll_t mask = key_to_poll(key);
-
-    poll->woken = true;
+    unsigned long flags;
 
     /* for instances that support it check for an event match first: */
-    if (mask) {
-        unsigned long flags;
+    if (mask && !(mask & poll->events))
+        return 0;
 
-        if (!(mask & poll->events))
-            return 0;
+    list_del_init(&poll->wait.entry);
 
-        /* try to complete the iocb inline if we can: */
-        if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-            list_del(&req->list);
-            spin_unlock_irqrestore(&ctx->completion_lock, flags);
+    if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+        list_del(&req->list);
+        io_poll_complete(ctx, req, mask);
+        spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-            list_del_init(&poll->wait.entry);
-            io_poll_complete(req, mask);
-            return 1;
-        }
+        io_cqring_ev_posted(ctx);
+        io_put_req(req);
+    } else {
+        queue_work(ctx->sqo_wq, &req->work);
     }
 
-    list_del_init(&poll->wait.entry);
-    queue_work(ctx->sqo_wq, &req->work);
     return 1;
 }
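
io_poll_wake() above completes inline only when spin_trylock_irqsave() succeeds, and otherwise punts to a workqueue, since the wake-up context must never block on the lock. The trylock-or-defer shape in portable C, with pthreads as a stand-in for the kernel primitives:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;

    static void queue_work(void)
    {
        printf("deferred to worker\n");     /* queue_work() analog */
    }

    /* Wake-up path: complete inline only if the lock is free right now. */
    static void on_event(void)
    {
        if (pthread_mutex_trylock(&completion_lock) == 0) {
            printf("completed inline\n");
            pthread_mutex_unlock(&completion_lock);
        } else {
            queue_work();
        }
    }

    int main(void)
    {
        on_event();                             /* inline */
        pthread_mutex_lock(&completion_lock);   /* simulate contention */
        on_event();                             /* deferred */
        pthread_mutex_unlock(&completion_lock);
        return 0;
    }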
@@ -1305,36 +1262,23 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
     struct io_poll_iocb *poll = &req->poll;
     struct io_ring_ctx *ctx = req->ctx;
     struct io_poll_table ipt;
-    unsigned flags;
+    bool cancel = false;
     __poll_t mask;
     u16 events;
-    int fd;
 
     if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
         return -EINVAL;
     if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
         return -EINVAL;
+    if (!poll->file)
+        return -EBADF;
 
     INIT_WORK(&req->work, io_poll_complete_work);
     events = READ_ONCE(sqe->poll_events);
     poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
-    flags = READ_ONCE(sqe->flags);
-    fd = READ_ONCE(sqe->fd);
-
-    if (flags & IOSQE_FIXED_FILE) {
-        if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-            return -EBADF;
-        poll->file = ctx->user_files[fd];
-        req->flags |= REQ_F_FIXED_FILE;
-    } else {
-        poll->file = fget(fd);
-    }
-    if (unlikely(!poll->file))
-        return -EBADF;
-
     poll->head = NULL;
-    poll->woken = false;
+    poll->done = false;
     poll->canceled = false;
 
     ipt.pt._qproc = io_poll_queue_proc;
@@ -1346,56 +1290,44 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
     INIT_LIST_HEAD(&poll->wait.entry);
     init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
-    /* one for removal from waitqueue, one for this function */
-    refcount_set(&req->refs, 2);
-
     mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
-    if (unlikely(!poll->head)) {
-        /* we did not manage to set up a waitqueue, done */
-        goto out;
-    }
 
     spin_lock_irq(&ctx->completion_lock);
-    spin_lock(&poll->head->lock);
-    if (poll->woken) {
-        /* wake_up context handles the rest */
-        mask = 0;
+    if (likely(poll->head)) {
+        spin_lock(&poll->head->lock);
+        if (unlikely(list_empty(&poll->wait.entry))) {
+            if (ipt.error)
+                cancel = true;
+            ipt.error = 0;
+            mask = 0;
+        }
+        if (mask || ipt.error)
+            list_del_init(&poll->wait.entry);
+        else if (cancel)
+            WRITE_ONCE(poll->canceled, true);
+        else if (!poll->done) /* actually waiting for an event */
+            list_add_tail(&req->list, &ctx->cancel_list);
+        spin_unlock(&poll->head->lock);
+    }
+    if (mask) { /* no async, we'd stolen it */
+        req->error = mangle_poll(mask);
         ipt.error = 0;
-    } else if (mask || ipt.error) {
-        /* if we get an error or a mask we are done */
-        WARN_ON_ONCE(list_empty(&poll->wait.entry));
-        list_del_init(&poll->wait.entry);
-    } else {
-        /* actually waiting for an event */
-        list_add_tail(&req->list, &ctx->cancel_list);
+        io_poll_complete(ctx, req, mask);
     }
-    spin_unlock(&poll->head->lock);
     spin_unlock_irq(&ctx->completion_lock);
 
-out:
-    if (unlikely(ipt.error)) {
-        if (!(flags & IOSQE_FIXED_FILE))
-            fput(poll->file);
-        /*
-         * Drop one of our refs to this req, __io_submit_sqe() will
-         * drop the other one since we're returning an error.
-         */
-        io_free_req(req);
-        return ipt.error;
+    if (mask) {
+        io_cqring_ev_posted(ctx);
+        io_put_req(req);
     }
-
-    if (mask)
-        io_poll_complete(req, mask);
-    io_free_req(req);
-    return 0;
+    return ipt.error;
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
                const struct sqe_submit *s, bool force_nonblock,
                struct io_submit_state *state)
 {
-    ssize_t ret;
-    int opcode;
+    int ret, opcode;
 
     if (unlikely(s->index >= ctx->sq_entries))
         return -EINVAL;
@@ -1524,10 +1456,13 @@ restart:
                 break;
             cond_resched();
         } while (1);
+
+        /* drop submission reference */
+        io_put_req(req);
     }
     if (ret) {
         io_cqring_add_event(ctx, sqe->user_data, ret, 0);
-        io_free_req(req);
+        io_put_req(req);
     }
 
     /* async context always use a copy of the sqe */
@@ -1614,11 +1549,55 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
     return ret;
 }
 
+static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+{
+    int op = READ_ONCE(sqe->opcode);
+
+    switch (op) {
+    case IORING_OP_NOP:
+    case IORING_OP_POLL_REMOVE:
+        return false;
+    default:
+        return true;
+    }
+}
+
+static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+               struct io_submit_state *state, struct io_kiocb *req)
+{
+    unsigned flags;
+    int fd;
+
+    flags = READ_ONCE(s->sqe->flags);
+    fd = READ_ONCE(s->sqe->fd);
+
+    if (!io_op_needs_file(s->sqe)) {
+        req->file = NULL;
+        return 0;
+    }
+
+    if (flags & IOSQE_FIXED_FILE) {
+        if (unlikely(!ctx->user_files ||
+            (unsigned) fd >= ctx->nr_user_files))
+            return -EBADF;
+        req->file = ctx->user_files[fd];
+        req->flags |= REQ_F_FIXED_FILE;
+    } else {
+        if (s->needs_fixed_file)
+            return -EBADF;
+        req->file = io_file_get(state, fd);
+        if (unlikely(!req->file))
+            return -EBADF;
+    }
+
+    return 0;
+}
+
 static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
              struct io_submit_state *state)
 {
     struct io_kiocb *req;
-    ssize_t ret;
+    int ret;
 
     /* enforce forwards compatibility on users */
     if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
@@ -1628,7 +1607,9 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
     if (unlikely(!req))
         return -EAGAIN;
 
-    req->rw.ki_filp = NULL;
+    ret = io_req_set_file(ctx, s, state, req);
+    if (unlikely(ret))
+        goto out;
 
     ret = __io_submit_sqe(ctx, req, s, true, state);
     if (ret == -EAGAIN) {
@@ -1649,11 +1630,23 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
             INIT_WORK(&req->work, io_sq_wq_submit_work);
             queue_work(ctx->sqo_wq, &req->work);
         }
-        ret = 0;
+
+        /*
+         * Queued up for async execution, worker will release
+         * submit reference when the iocb is actually submitted.
+         */
+        return 0;
     }
 
+out:
+    /* drop submission reference */
+    io_put_req(req);
+
+    /* and drop final reference, if we failed */
     if (ret)
-        io_free_req(req);
+        io_put_req(req);
 
     return ret;
 }
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7d..abdd18e404f8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
         if (should_dirty) {
             bio_check_pages_dirty(bio);
         } else {
-            struct bio_vec *bvec;
-            int i;
-            struct bvec_iter_all iter_all;
+            if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+                struct bvec_iter_all iter_all;
+                struct bio_vec *bvec;
+                int i;
 
-            bio_for_each_segment_all(bvec, bio, i, iter_all)
-                put_page(bvec->bv_page);
+                bio_for_each_segment_all(bvec, bio, i, iter_all)
+                    put_page(bvec->bv_page);
+            }
             bio_put(bio);
         }
     }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b0c814bcc7e3..cb2aa7ecafff 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -57,7 +57,6 @@ struct blk_mq_hw_ctx {
     unsigned int        queue_num;
 
     atomic_t            nr_active;
-    unsigned int        nr_expired;
 
     struct hlist_node   cpuhp_dead;
     struct kobject      kobj;
@@ -300,8 +299,6 @@ void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
 
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
-                bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 bool blk_mq_complete_request(struct request *rq);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d66bf5f32610..791fee35df88 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -215,6 +215,7 @@ struct bio {
 /*
  * bio flags
  */
+#define BIO_NO_PAGE_REF 0 /* don't put release vec pages */
 #define BIO_SEG_VALID   1 /* bi_phys_segments valid */
 #define BIO_CLONED      2 /* doesn't own data */
 #define BIO_BOUNCED     3 /* bio is a bounce bio */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0de92b29f589..5c58a3b2bf00 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -50,6 +50,9 @@ struct blk_stat_callback;
 /* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
 
+/* Doing classic polling */
+#define BLK_MQ_POLL_CLASSIC -1
+
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index a420c07904bc..337d5049ff93 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -294,6 +294,8 @@ extern void ceph_destroy_client(struct ceph_client *client);
 extern int __ceph_open_session(struct ceph_client *client,
                    unsigned long started);
 extern int ceph_open_session(struct ceph_client *client);
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+                unsigned long timeout);
 
 /* pagevec.c */
 extern void ceph_release_page_vector(struct page **pages, int num_pages);
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 14d558146aea..20f3e3f029b9 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -330,7 +330,7 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 /*
  * This one is special, since it doesn't actually clear the bit, rather it
  * sets the corresponding bit in the ->cleared mask instead. Paired with
- * the caller doing sbitmap_batch_clear() if a given index is full, which
+ * the caller doing sbitmap_deferred_clear() if a given index is full, which
  * will clear the previously freed entries in the corresponding ->word.
  */
 static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 87477e1640f9..f184af1999a8 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -23,14 +23,23 @@ struct kvec {
 };
 
 enum iter_type {
-    ITER_IOVEC = 0,
-    ITER_KVEC = 2,
-    ITER_BVEC = 4,
-    ITER_PIPE = 8,
-    ITER_DISCARD = 16,
+    /* set if ITER_BVEC doesn't hold a bv_page ref */
+    ITER_BVEC_FLAG_NO_REF = 2,
+
+    /* iter types */
+    ITER_IOVEC = 4,
+    ITER_KVEC = 8,
+    ITER_BVEC = 16,
+    ITER_PIPE = 32,
+    ITER_DISCARD = 64,
 };
 
 struct iov_iter {
+    /*
+     * Bit 0 is the read/write bit, set if we're writing.
+     * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
+     * the caller isn't expecting to drop a page reference when done.
+     */
     unsigned int type;
     size_t iov_offset;
     size_t count;
@@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
     return i->type & (READ | WRITE);
 }
 
+static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i)
+{
+    return (i->type & ITER_BVEC_FLAG_NO_REF) != 0;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *
diff --git a/include/misc/charlcd.h b/include/misc/charlcd.h
index 23f61850f363..1832402324ce 100644
--- a/include/misc/charlcd.h
+++ b/include/misc/charlcd.h
@@ -35,6 +35,7 @@ struct charlcd_ops {
 };
 
 struct charlcd *charlcd_alloc(unsigned int drvdata_size);
+void charlcd_free(struct charlcd *lcd);
 
 int charlcd_register(struct charlcd *lcd);
 int charlcd_unregister(struct charlcd *lcd);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 9cab80207ced..79eac465ec65 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -738,7 +738,6 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
 }
 EXPORT_SYMBOL(__ceph_open_session);
 
-
 int ceph_open_session(struct ceph_client *client)
 {
     int ret;
@@ -754,6 +753,23 @@ int ceph_open_session(struct ceph_client *client)
 }
 EXPORT_SYMBOL(ceph_open_session);
 
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+                unsigned long timeout)
+{
+    u64 newest_epoch;
+    int ret;
+
+    ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
+    if (ret)
+        return ret;
+
+    if (client->osdc.osdmap->epoch >= newest_epoch)
+        return 0;
+
+    ceph_osdc_maybe_request_map(&client->osdc);
+    return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout);
+}
+EXPORT_SYMBOL(ceph_wait_for_latest_osdmap);
 
 static int __init init_ceph_lib(void)
 {
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 18deb3d889c4..a53e4fbb6319 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -922,6 +922,15 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
     mutex_unlock(&monc->mutex);
 
     ret = wait_generic_request(req);
+    if (!ret)
+        /*
+         * Make sure we have the osdmap that includes the blacklist
+         * entry. This is needed to ensure that the OSDs pick up the
+         * new blacklist before processing any future requests from
+         * this client.
+         */
+        ret = ceph_wait_for_latest_osdmap(monc->client, 0);
+
 out:
     put_generic_request(req);
     return ret;
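
Finally, a note on the uio.h hunk above: it packs flag bits below the iter type constants inside the one unsigned int type field, which is why the type values were renumbered to powers of two starting at 4. A compact illustration of the same bit layout (names invented; not the kernel definitions):

    #include <stdio.h>
    #include <stdbool.h>

    enum {
        F_WRITE  = 1u << 0,     /* bit 0: read/write */
        F_NO_REF = 1u << 1,     /* bit 1: don't drop page refs */
        T_IOVEC  = 1u << 2,     /* type constants begin above the flags */
        T_BVEC   = 1u << 4,
    };

    static bool is_bvec(unsigned int type)     { return type & T_BVEC; }
    static bool bvec_no_ref(unsigned int type) { return type & F_NO_REF; }

    int main(void)
    {
        unsigned int type = T_BVEC | F_WRITE;

        type |= F_NO_REF;   /* like io_import_fixed() tagging the iter */
        printf("bvec=%d no_ref=%d write=%d\n",
               is_bvec(type), bvec_no_ref(type), !!(type & F_WRITE));
        return 0;
    }

Keeping flags and type in one word is what lets bio_iov_iter_get_pages() test iov_iter_bvec_no_ref(iter) with a single mask, as in the bio.c hunks at the top of this diff.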