diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-13 22:12:44 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-13 22:12:44 +0300 |
commit | 3ad11d7ac8872b1c8da54494721fad8907ee41f7 (patch) | |
tree | 439d7cb75466978be936250c65a27ff05e82d9bc /include | |
parent | 857d64485e7c920364688a8a6dd0ffe5774327b6 (diff) | |
parent | 8858e8d98d5457ba23bcd0d99ce23e272b8b09a1 (diff) | |
download | linux-3ad11d7ac8872b1c8da54494721fad8907ee41f7.tar.xz |
Merge tag 'block-5.10-2020-10-12' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- Series of merge handling cleanups (Baolin, Christoph)
- Series of blk-throttle fixes and cleanups (Baolin)
- Series cleaning up BDI, separating the block device from the
backing_dev_info (Christoph)
- Removal of bdget() as a generic API (Christoph)
- Removal of blkdev_get() as a generic API (Christoph)
- Cleanup of is-partition checks (Christoph)
- Series reworking disk revalidation (Christoph)
- Series cleaning up bio flags (Christoph)
- bio crypt fixes (Eric)
- IO stats inflight tweak (Gabriel)
- blk-mq tags fixes (Hannes)
- Buffer invalidation fixes (Jan)
- Allow soft limits for zone append (Johannes)
- Shared tag set improvements (John, Kashyap)
- Allow IOPRIO_CLASS_RT for CAP_SYS_NICE (Khazhismel)
- DM no-wait support (Mike, Konstantin)
- Request allocation improvements (Ming)
- Allow md/dm/bcache to use IO stat helpers (Song)
- Series improving blk-iocost (Tejun)
- Various cleanups (Geert, Damien, Danny, Julia, Tetsuo, Tian, Wang,
Xianting, Yang, Yufen, yangerkun)
* tag 'block-5.10-2020-10-12' of git://git.kernel.dk/linux-block: (191 commits)
block: fix uapi blkzoned.h comments
blk-mq: move cancel of hctx->run_work to the front of blk_exit_queue
blk-mq: get rid of the dead flush handle code path
block: get rid of unnecessary local variable
block: fix comment and add lockdep assert
blk-mq: use helper function to test hw stopped
block: use helper function to test queue register
block: remove redundant mq check
block: invoke blk_mq_exit_sched no matter whether have .exit_sched
percpu_ref: don't refer to ref->data if it isn't allocated
block: ratelimit handle_bad_sector() message
blk-throttle: Re-use the throtl_set_slice_end()
blk-throttle: Open code __throtl_de/enqueue_tg()
blk-throttle: Move service tree validation out of the throtl_rb_first()
blk-throttle: Move the list operation after list validation
blk-throttle: Fix IO hang for a corner case
blk-throttle: Avoid tracking latency if low limit is invalid
blk-throttle: Avoid getting the current time if tg->last_finish_time is 0
blk-throttle: Remove a meaningless parameter for throtl_downgrade_state()
block: Remove redundant 'return' statement
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/backing-dev.h | 78 | ||||
-rw-r--r-- | include/linux/blk-crypto.h | 20 | ||||
-rw-r--r-- | include/linux/blk-mq.h | 15 | ||||
-rw-r--r-- | include/linux/blk_types.h | 7 | ||||
-rw-r--r-- | include/linux/blkdev.h | 84 | ||||
-rw-r--r-- | include/linux/device-mapper.h | 6 | ||||
-rw-r--r-- | include/linux/fs.h | 2 | ||||
-rw-r--r-- | include/linux/genhd.h | 15 | ||||
-rw-r--r-- | include/linux/ide.h | 2 | ||||
-rw-r--r-- | include/linux/percpu-refcount.h | 52 | ||||
-rw-r--r-- | include/linux/suspend.h | 4 | ||||
-rw-r--r-- | include/linux/swap.h | 3 | ||||
-rw-r--r-- | include/trace/events/iocost.h | 67 | ||||
-rw-r--r-- | include/uapi/linux/blkzoned.h | 15 | ||||
-rw-r--r-- | include/uapi/linux/capability.h | 2 |
15 files changed, 199 insertions, 173 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0b06b2d26c9a..44df4fcef65c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -110,33 +110,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); /* * Flags in backing_dev_info::capability * - * The first three flags control whether dirty pages will contribute to the - * VM's accounting and whether writepages() should be called for dirty pages - * (something that would not, for example, be appropriate for ramfs) - * - * WARNING: these flags are closely related and should not normally be - * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these - * three flags into a single convenience macro. - * - * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting - * BDI_CAP_NO_WRITEBACK: Don't write pages back - * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages - * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. - * - * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. - * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be - * inefficient. 
+ * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages + * should contribute to accounting + * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages + * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ -#define BDI_CAP_NO_ACCT_DIRTY 0x00000001 -#define BDI_CAP_NO_WRITEBACK 0x00000002 -#define BDI_CAP_NO_ACCT_WB 0x00000004 -#define BDI_CAP_STABLE_WRITES 0x00000008 -#define BDI_CAP_STRICTLIMIT 0x00000010 -#define BDI_CAP_CGROUP_WRITEBACK 0x00000020 -#define BDI_CAP_SYNCHRONOUS_IO 0x00000040 - -#define BDI_CAP_NO_ACCT_AND_WRITEBACK \ - (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) +#define BDI_CAP_WRITEBACK (1 << 0) +#define BDI_CAP_WRITEBACK_ACCT (1 << 1) +#define BDI_CAP_STRICTLIMIT (1 << 2) extern struct backing_dev_info noop_backing_dev_info; @@ -175,41 +156,9 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(int sync, long timeout); -static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) -{ - return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO; -} - -static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) -{ - return bdi->capabilities & BDI_CAP_STABLE_WRITES; -} - -static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) -{ - return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK); -} - -static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi) -{ - return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY); -} - -static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) -{ - /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */ - return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB | - BDI_CAP_NO_WRITEBACK)); -} - -static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) -{ - return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host)); -} - -static inline bool mapping_cap_account_dirty(struct address_space 
*mapping) +static inline bool mapping_can_writeback(struct address_space *mapping) { - return bdi_cap_account_dirty(inode_to_bdi(mapping->host)); + return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } static inline int bdi_sched_wait(void *word) @@ -233,9 +182,9 @@ int inode_congested(struct inode *inode, int cong_bits); * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * - * cgroup writeback requires support from both the bdi and filesystem. - * Also, both memcg and iocg have to be on the default hierarchy. Test - * whether all conditions are met. + * Cgroup writeback requires support from the filesystem. Also, both memcg and + * iocg have to be on the default hierarchy. Test whether all conditions are + * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. @@ -246,8 +195,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return cgroup_subsys_on_dfl(memory_cgrp_subsys) && cgroup_subsys_on_dfl(io_cgrp_subsys) && - bdi_cap_account_dirty(bdi) && - (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && + (bdi->capabilities & BDI_CAP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index e82342907f2b..69b24fe92cbf 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -112,12 +112,24 @@ static inline bool bio_has_crypt_ctx(struct bio *bio) #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ -void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); -static inline void bio_crypt_clone(struct bio *dst, struct bio *src, - gfp_t gfp_mask) +int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); +/** + * bio_crypt_clone - clone bio encryption context + * @dst: destination bio + * @src: source bio + * @gfp_mask: memory allocation flags + * + * If @src has an encryption context, clone it to @dst. 
+ * + * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if + * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. + */ +static inline int bio_crypt_clone(struct bio *dst, struct bio *src, + gfp_t gfp_mask) { if (bio_has_crypt_ctx(src)) - __bio_crypt_clone(dst, src, gfp_mask); + return __bio_crypt_clone(dst, src, gfp_mask); + return 0; } #endif /* __LINUX_BLK_CRYPTO_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9d2d5ad367a4..b23eeca4d677 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -139,6 +139,10 @@ struct blk_mq_hw_ctx { * shared across request queues. */ atomic_t nr_active; + /** + * @elevator_queued: Number of queued requests on hctx. + */ + atomic_t elevator_queued; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; @@ -231,6 +235,9 @@ enum hctx_type { * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. + * @__bitmap_tags: A shared tags sbitmap, used over all hctx's + * @__breserved_tags: + * A shared reserved tags sbitmap, used over all hctx's * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @tag_list_lock: Serializes tag_list accesses. 
@@ -249,7 +256,10 @@ struct blk_mq_tag_set { unsigned int timeout; unsigned int flags; void *driver_data; + atomic_t active_queues_shared_sbitmap; + struct sbitmap_queue __bitmap_tags; + struct sbitmap_queue __breserved_tags; struct blk_mq_tags **tags; struct mutex tag_list_lock; @@ -378,12 +388,13 @@ struct blk_mq_ops { enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, - BLK_MQ_F_TAG_SHARED = 1 << 1, + BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO: */ BLK_MQ_F_STACKING = 1 << 2, + BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, @@ -489,8 +500,6 @@ void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); -bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, - struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b3fc5d3dd8ea..7d7c13238fdb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,7 +20,7 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { - dev_t bd_dev; /* not a kdev_t - it's a search key */ + dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ struct super_block * bd_super; @@ -37,7 +37,8 @@ struct block_device { struct hd_struct * bd_part; /* number of times partitions within this device have been opened. 
*/ unsigned bd_part_count; - int bd_invalidated; + + spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; struct backing_dev_info *bd_bdi; @@ -255,8 +256,6 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_USER_MAPPED, /* contains user pages */ - BIO_NULL_MAPPED, /* contains invalid user pages */ BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 868e11face00..1d99bf70a90a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include <linux/percpu-refcount.h> #include <linux/scatterlist.h> #include <linux/blkzoned.h> +#include <linux/pm.h> struct module; struct scsi_ioctl_command; @@ -398,6 +399,8 @@ struct request_queue { struct request *last_merge; struct elevator_queue *elevator; + struct percpu_ref q_usage_counter; + struct blk_queue_stats *stats; struct rq_qos *rq_qos; @@ -460,7 +463,7 @@ struct request_queue { #ifdef CONFIG_PM struct device *dev; - int rpm_status; + enum rpm_status rpm_status; unsigned int nr_pending; #endif @@ -486,6 +489,8 @@ struct request_queue { struct timer_list timeout; struct work_struct timeout_work; + atomic_t nr_active_requests_shared_sbitmap; + struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); @@ -568,7 +573,6 @@ struct request_queue { * percpu_ref_kill() and percpu_ref_reinit(). 
*/ struct mutex mq_freeze_lock; - struct percpu_ref q_usage_counter; struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; @@ -605,6 +609,7 @@ struct request_queue { #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ +#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_WC 17 /* Write back caching */ #define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ @@ -617,9 +622,12 @@ struct request_queue { #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ +#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ +#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_SAME_COMP)) + (1 << QUEUE_FLAG_SAME_COMP) | \ + (1 << QUEUE_FLAG_NOWAIT)) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); @@ -633,6 +641,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) +#define blk_queue_stable_writes(q) \ + test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) @@ -659,6 +669,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_pm_only(q) 
atomic_read(&(q)->pm_only) #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) +#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); @@ -1061,11 +1072,17 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, static inline unsigned int blk_max_size_offset(struct request_queue *q, sector_t offset) { - if (!q->limits.chunk_sectors) + unsigned int chunk_sectors = q->limits.chunk_sectors; + + if (!chunk_sectors) return q->limits.max_sectors; - return min(q->limits.max_sectors, (unsigned int)(q->limits.chunk_sectors - - (offset & (q->limits.chunk_sectors - 1)))); + if (likely(is_power_of_2(chunk_sectors))) + chunk_sectors -= offset & (chunk_sectors - 1); + else + chunk_sectors -= sector_div(offset, chunk_sectors); + + return min(q->limits.max_sectors, chunk_sectors); } static inline unsigned int blk_rq_get_max_sectors(struct request *rq, @@ -1132,6 +1149,7 @@ extern void blk_queue_max_zone_append_sectors(struct request_queue *q, extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); +void blk_queue_update_readahead(struct request_queue *q); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); @@ -1341,6 +1359,11 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, extern int blk_verify_command(unsigned char *cmd, fmode_t mode); +static inline bool bdev_is_partition(struct block_device *bdev) +{ + return bdev->bd_partno; +} + enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, @@ 
-1386,7 +1409,10 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) { - return q->limits.max_zone_append_sectors; + + const struct queue_limits *l = &q->limits; + + return min(l->max_zone_append_sectors, l->max_sectors); } static inline unsigned queue_logical_block_size(const struct request_queue *q) @@ -1457,10 +1483,9 @@ static inline int bdev_alignment_offset(struct block_device *bdev) if (q->limits.misaligned) return -1; - - if (bdev != bdev->bd_contains) - return bdev->bd_part->alignment_offset; - + if (bdev_is_partition(bdev)) + return queue_limit_alignment_offset(&q->limits, + bdev->bd_part->start_sect); return q->limits.alignment_offset; } @@ -1499,9 +1524,9 @@ static inline int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - if (bdev != bdev->bd_contains) - return bdev->bd_part->discard_alignment; - + if (bdev_is_partition(bdev)) + return queue_limit_discard_alignment(&q->limits, + bdev->bd_part->start_sect); return q->limits.discard_alignment; } @@ -1644,10 +1669,6 @@ extern int blk_integrity_compare(struct gendisk *, struct gendisk *); extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); -extern bool blk_integrity_merge_rq(struct request_queue *, struct request *, - struct request *); -extern bool blk_integrity_merge_bio(struct request_queue *, struct request *, - struct bio *); static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { @@ -1775,18 +1796,6 @@ static inline unsigned short queue_max_integrity_segments(const struct request_q { return 0; } -static inline bool blk_integrity_merge_rq(struct request_queue *rq, - struct request *r1, - struct request *r2) -{ - return true; -} -static inline bool blk_integrity_merge_bio(struct request_queue 
*rq, - struct request *r, - struct bio *b) -{ - return true; -} static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) @@ -1932,6 +1941,11 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); +unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, + struct bio *bio); +void part_end_io_acct(struct hd_struct *part, struct bio *bio, + unsigned long start_time); + /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for @@ -1969,7 +1983,6 @@ void blkdev_show(struct seq_file *seqf, off_t offset); #define BLKDEV_MAJOR_MAX 0 #endif -int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); @@ -1980,17 +1993,24 @@ void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, void blkdev_put(struct block_device *bdev, fmode_t mode); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdget(dev_t); +struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, + loff_t lend); int sync_blockdev(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { } +static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + return 0; +} static inline int sync_blockdev(struct block_device *bdev) { return 0; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 93096e524e43..d6f8d4ba8d48 100644 --- 
a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -252,6 +252,12 @@ struct target_type { #define DM_TARGET_ZONED_HM 0x00000040 #define dm_target_supports_zoned_hm(type) ((type)->features & DM_TARGET_ZONED_HM) +/* + * A target handles REQ_NOWAIT + */ +#define DM_TARGET_NOWAIT 0x00000080 +#define dm_target_supports_nowait(type) ((type)->features & DM_TARGET_NOWAIT) + struct dm_target { struct dm_table *table; struct target_type *type; diff --git a/include/linux/fs.h b/include/linux/fs.h index 34ad5fe166a1..0b1e2f1f388b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1377,7 +1377,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ -#define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */ +#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 4ab853461dff..38f23d757013 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -65,8 +65,6 @@ struct hd_struct { struct disk_stats __percpu *dkstats; struct percpu_ref ref; - sector_t alignment_offset; - unsigned int discard_alignment; struct device __dev; struct kobject *holder_dir; int policy, partno; @@ -193,6 +191,8 @@ struct gendisk { void *private_data; int flags; + unsigned long state; +#define GD_NEED_PART_SCAN 0 struct rw_semaphore lookup_sem; struct kobject *slave_dir; @@ -315,9 +315,8 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -extern void 
set_capacity_revalidate_and_notify(struct gendisk *disk, - sector_t size, bool revalidate); -extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); +void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, + bool update_bdev); /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; @@ -372,10 +371,10 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); int register_blkdev(unsigned int major, const char *name); void unregister_blkdev(unsigned int major, const char *name); -int revalidate_disk(struct gendisk *disk); -int check_disk_change(struct block_device *bdev); +void revalidate_disk_size(struct gendisk *disk, bool verbose); +bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void bd_set_size(struct block_device *bdev, loff_t size); +void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); /* for drivers/char/raw.c: */ int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); diff --git a/include/linux/ide.h b/include/linux/ide.h index a254841bd315..62653769509f 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -490,8 +490,6 @@ enum { IDE_DFLAG_NOPROBE = BIT(9), /* need to do check_media_change() */ IDE_DFLAG_REMOVABLE = BIT(10), - /* needed for removable devices */ - IDE_DFLAG_ATTACH = BIT(11), IDE_DFLAG_FORCED_GEOM = BIT(12), /* disallow setting unmask bit */ IDE_DFLAG_NO_UNMASK = BIT(13), diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 87d8a38bdea1..16c35a728b4c 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -92,18 +92,30 @@ enum { PERCPU_REF_ALLOW_REINIT = 1 << 2, }; -struct percpu_ref { +struct percpu_ref_data { atomic_long_t count; - /* - * The low bit of the pointer indicates whether the ref is in percpu - * mode; if set, then get/put will manipulate the 
atomic_t. - */ - unsigned long percpu_count_ptr; percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; bool force_atomic:1; bool allow_reinit:1; struct rcu_head rcu; + struct percpu_ref *ref; +}; + +struct percpu_ref { + /* + * The low bit of the pointer indicates whether the ref is in percpu + * mode; if set, then get/put will manipulate the atomic_t. + */ + unsigned long percpu_count_ptr; + + /* + * 'percpu_ref' is often embedded into user structure, and only + * 'percpu_count_ptr' is required in fast path, move other fields + * into 'percpu_ref_data', so we can reduce memory footprint in + * fast path. + */ + struct percpu_ref_data *data; }; int __must_check percpu_ref_init(struct percpu_ref *ref, @@ -118,6 +130,7 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); void percpu_ref_resurrect(struct percpu_ref *ref); void percpu_ref_reinit(struct percpu_ref *ref); +bool percpu_ref_is_zero(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref @@ -191,7 +204,7 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) if (__ref_is_percpu(ref, &percpu_count)) this_cpu_add(*percpu_count, nr); else - atomic_long_add(nr, &ref->count); + atomic_long_add(nr, &ref->data->count); rcu_read_unlock(); } @@ -231,7 +244,7 @@ static inline bool percpu_ref_tryget_many(struct percpu_ref *ref, this_cpu_add(*percpu_count, nr); ret = true; } else { - ret = atomic_long_add_unless(&ref->count, nr, 0); + ret = atomic_long_add_unless(&ref->data->count, nr, 0); } rcu_read_unlock(); @@ -279,7 +292,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) this_cpu_inc(*percpu_count); ret = true; } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { - ret = atomic_long_inc_not_zero(&ref->count); + ret = atomic_long_inc_not_zero(&ref->data->count); } rcu_read_unlock(); @@ -305,8 +318,8 @@ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) if 
(__ref_is_percpu(ref, &percpu_count)) this_cpu_sub(*percpu_count, nr); - else if (unlikely(atomic_long_sub_and_test(nr, &ref->count))) - ref->release(ref); + else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count))) + ref->data->release(ref); rcu_read_unlock(); } @@ -339,21 +352,4 @@ static inline bool percpu_ref_is_dying(struct percpu_ref *ref) return ref->percpu_count_ptr & __PERCPU_REF_DEAD; } -/** - * percpu_ref_is_zero - test whether a percpu refcount reached zero - * @ref: percpu_ref to test - * - * Returns %true if @ref reached zero. - * - * This function is safe to call as long as @ref is between init and exit. - */ -static inline bool percpu_ref_is_zero(struct percpu_ref *ref) -{ - unsigned long __percpu *percpu_count; - - if (__ref_is_percpu(ref, &percpu_count)) - return false; - return !atomic_long_read(&ref->count); -} - #endif diff --git a/include/linux/suspend.h b/include/linux/suspend.h index cb9afad82a90..8af13ba60c7e 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -473,9 +473,9 @@ static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { #endif /* CONFIG_HIBERNATION */ #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV -int is_hibernate_resume_dev(const struct inode *); +int is_hibernate_resume_dev(dev_t dev); #else -static inline int is_hibernate_resume_dev(const struct inode *i) { return 0; } +static inline int is_hibernate_resume_dev(dev_t dev) { return 0; } #endif /* Hibernation and suspend events */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 661046994db4..4340a7b6e7a1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -467,7 +467,8 @@ extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); -extern int swap_type_of(dev_t, sector_t, struct block_device **); +int swap_type_of(dev_t device, sector_t offset); +int find_first_swap(dev_t *device); 
extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index c2f580fd371b..0b6869980ba2 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -26,7 +26,6 @@ TRACE_EVENT(iocost_iocg_activate, __field(u64, vrate) __field(u64, last_period) __field(u64, cur_period) - __field(u64, last_vtime) __field(u64, vtime) __field(u32, weight) __field(u32, inuse) @@ -42,7 +41,6 @@ TRACE_EVENT(iocost_iocg_activate, __entry->vrate = now->vrate; __entry->last_period = last_period; __entry->cur_period = cur_period; - __entry->last_vtime = iocg->last_vtime; __entry->vtime = vtime; __entry->weight = iocg->weight; __entry->inuse = iocg->inuse; @@ -51,13 +49,12 @@ TRACE_EVENT(iocost_iocg_activate, ), TP_printk("[%s:%s] now=%llu:%llu vrate=%llu " - "period=%llu->%llu vtime=%llu->%llu " + "period=%llu->%llu vtime=%llu " "weight=%u/%u hweight=%llu/%llu", __get_str(devname), __get_str(cgroup), __entry->now, __entry->vnow, __entry->vrate, __entry->last_period, __entry->cur_period, - __entry->last_vtime, __entry->vtime, - __entry->inuse, __entry->weight, + __entry->vtime, __entry->inuse, __entry->weight, __entry->hweight_inuse, __entry->hweight_active ) ); @@ -98,7 +95,7 @@ DECLARE_EVENT_CLASS(iocg_inuse_update, ) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_shortage, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -108,7 +105,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, old_hw_inuse, new_hw_inuse) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_transfer, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -118,7 +115,7 @@ DEFINE_EVENT(iocg_inuse_update, 
iocost_inuse_giveaway, old_hw_inuse, new_hw_inuse) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_adjust, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -131,11 +128,9 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, TRACE_EVENT(iocost_ioc_vrate_adj, TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 *missed_ppm, - u32 rq_wait_pct, int nr_lagging, int nr_shortages, - int nr_surpluses), + u32 rq_wait_pct, int nr_lagging, int nr_shortages), - TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages, - nr_surpluses), + TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages), TP_STRUCT__entry ( __string(devname, ioc_name(ioc)) @@ -147,7 +142,6 @@ TRACE_EVENT(iocost_ioc_vrate_adj, __field(u32, rq_wait_pct) __field(int, nr_lagging) __field(int, nr_shortages) - __field(int, nr_surpluses) ), TP_fast_assign( @@ -160,15 +154,54 @@ TRACE_EVENT(iocost_ioc_vrate_adj, __entry->rq_wait_pct = rq_wait_pct; __entry->nr_lagging = nr_lagging; __entry->nr_shortages = nr_shortages; - __entry->nr_surpluses = nr_surpluses; ), - TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d", + TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d", __get_str(devname), __entry->old_vrate, __entry->new_vrate, __entry->busy_level, __entry->read_missed_ppm, __entry->write_missed_ppm, - __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages, - __entry->nr_surpluses + __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages + ) +); + +TRACE_EVENT(iocost_iocg_forgive_debt, + + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u32 usage_pct, u64 old_debt, u64 new_debt, + u64 old_delay, u64 new_delay), + + TP_ARGS(iocg, path, now, usage_pct, + old_debt, new_debt, old_delay, new_delay), + + TP_STRUCT__entry ( + __string(devname, 
ioc_name(iocg->ioc)) + __string(cgroup, path) + __field(u64, now) + __field(u64, vnow) + __field(u32, usage_pct) + __field(u64, old_debt) + __field(u64, new_debt) + __field(u64, old_delay) + __field(u64, new_delay) + ), + + TP_fast_assign( + __assign_str(devname, ioc_name(iocg->ioc)); + __assign_str(cgroup, path); + __entry->now = now->now; + __entry->vnow = now->vnow; + __entry->usage_pct = usage_pct; + __entry->old_debt = old_debt; + __entry->new_debt = new_debt; + __entry->old_delay = old_delay; + __entry->new_delay = new_delay; + ), + + TP_printk("[%s:%s] now=%llu:%llu usage=%u debt=%llu->%llu delay=%llu->%llu", + __get_str(devname), __get_str(cgroup), + __entry->now, __entry->vnow, __entry->usage_pct, + __entry->old_debt, __entry->new_debt, + __entry->old_delay, __entry->new_delay ) ); diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 42c3366cc25f..656a326821a2 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -93,12 +93,15 @@ enum blk_zone_report_flags { * @non_seq: Flag indicating that the zone is using non-sequential resources * (for host-aware zoned block devices only). * @reset: Flag indicating that a zone reset is recommended. - * @reserved: Padding to 64 B to match the ZBC/ZAC defined zone descriptor size. + * @resv: Padding for 8B alignment. + * @capacity: Zone usable capacity in 512 B sector units + * @reserved: Padding to 64 B to match the ZBC, ZAC and ZNS defined zone + * descriptor size. * - * start, len and wp use the regular 512 B sector unit, regardless of the - * device logical block size. The overall structure size is 64 B to match the - * ZBC/ZAC defined zone descriptor and allow support for future additional - * zone information. + * start, len, capacity and wp use the regular 512 B sector unit, regardless + * of the device logical block size. 
The overall structure size is 64 B to + * match the ZBC, ZAC and ZNS defined zone descriptor and allow support for + * future additional zone information. */ struct blk_zone { __u64 start; /* Zone start sector */ @@ -118,7 +121,7 @@ struct blk_zone { * * @sector: starting sector of report * @nr_zones: IN maximum / OUT actual - * @reserved: padding to 16 byte alignment + * @flags: one or more flags as defined by enum blk_zone_report_flags. * @zones: Space to hold @nr_zones @zones entries on reply. * * The array of at most @nr_zones must follow this structure in memory. diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 395dd0df8d08..c6ca33034147 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -288,6 +288,8 @@ struct vfs_ns_cap_data { processes and setting the scheduling algorithm used by another process. */ /* Allow setting cpu affinity on other processes */ +/* Allow setting realtime ioprio class */ +/* Allow setting ioprio class on other processes */ #define CAP_SYS_NICE 23 |