summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-16 07:20:52 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-16 07:20:52 +0300
commit9637d517347e80ee2fe1c5d8ce45ba1b88d8b5cd (patch)
tree3cee2a1d8b3c6ea466924517307a1f98ada1e92f /include
parent273cbf61c3ddee9574ef1f4959b9bc6db5b24271 (diff)
parent787c79d6393fc028887cc1b6066915f0b094e92f (diff)
downloadlinux-9637d517347e80ee2fe1c5d8ce45ba1b88d8b5cd.tar.xz
Merge tag 'for-linus-20190715' of git://git.kernel.dk/linux-block
Pull more block updates from Jens Axboe: "A later pull request with some followup items. I had some vacation coming up to the merge window, so certain things items were delayed a bit. This pull request also contains fixes that came in within the last few days of the merge window, which I didn't want to push right before sending you a pull request. This contains: - NVMe pull request, mostly fixes, but also a few minor items on the feature side that were timing constrained (Christoph et al) - Report zones fixes (Damien) - Removal of dead code (Damien) - Turn on cgroup psi memstall (Josef) - block cgroup MAINTAINERS entry (Konstantin) - Flush init fix (Josef) - blk-throttle low iops timing fix (Konstantin) - nbd resize fixes (Mike) - nbd 0 blocksize crash fix (Xiubo) - block integrity error leak fix (Wenwen) - blk-cgroup writeback and priority inheritance fixes (Tejun)" * tag 'for-linus-20190715' of git://git.kernel.dk/linux-block: (42 commits) MAINTAINERS: add entry for block io cgroup null_blk: fixup ->report_zones() for !CONFIG_BLK_DEV_ZONED block: Limit zone array allocation size sd_zbc: Fix report zones buffer allocation block: Kill gfp_t argument of blkdev_report_zones() block: Allow mapping of vmalloc-ed buffers block/bio-integrity: fix a memory leak bug nvme: fix NULL deref for fabrics options nbd: add netlink reconfigure resize support nbd: fix crash when the blksize is zero block: Disable write plugging for zoned block devices block: Fix elevator name declaration block: Remove unused definitions nvme: fix regression upon hot device removal and insertion blk-throttle: fix zero wait time for iops throttled group block: Fix potential overflow in blk_report_zones() blkcg: implement REQ_CGROUP_PUNT blkcg, writeback: Implement wbc_blkcg_css() blkcg, writeback: Add wbc->no_cgroup_owner blkcg, writeback: Rename wbc_account_io() to wbc_account_cgroup_owner() ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/backing-dev.h1
-rw-r--r--include/linux/blk-cgroup.h16
-rw-r--r--include/linux/blk_types.h10
-rw-r--r--include/linux/blkdev.h14
-rw-r--r--include/linux/cgroup.h1
-rw-r--r--include/linux/device-mapper.h3
-rw-r--r--include/linux/elevator.h11
-rw-r--r--include/linux/nvme.h12
-rw-r--r--include/linux/writeback.h41
9 files changed, 81 insertions, 28 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f9b029180241..35b31d176f74 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
extern struct workqueue_struct *bdi_wq;
+extern struct workqueue_struct *bdi_async_bio_wq;
static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 33f23a858438..689a58231288 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -132,13 +132,17 @@ struct blkcg_gq {
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
- struct rcu_head rcu_head;
+ spinlock_t async_bio_lock;
+ struct bio_list async_bios;
+ struct work_struct async_bio_work;
atomic_t use_delay;
atomic64_t delay_nsec;
atomic64_t delay_start;
u64 last_delay;
int last_use;
+
+ struct rcu_head rcu_head;
};
typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
struct bio *bio) { return false; }
#endif
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+ if (bio->bi_opf & REQ_CGROUP_PUNT)
+ return __blkcg_punt_bio_submit(bio);
+ else
+ return false;
+}
static inline void blkcg_bio_issue_init(struct bio *bio)
{
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline bool blkcg_bio_issue_check(struct request_queue *q,
struct bio *bio) { return true; }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6a53799c3fe2..feff3fe4467e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,14 @@ enum req_flag_bits {
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
__REQ_NOWAIT, /* Don't wait if request will block */
+ /*
+ * When a shared kthread needs to issue a bio for a cgroup, doing
+ * so synchronously can lead to priority inversions as the kthread
+ * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes
+ * submit_bio() punt the actual issuing to a dedicated per-blkcg
+ * work item to avoid such priority inversions.
+ */
+ __REQ_CGROUP_PUNT,
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)
+
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
#define REQ_HIPRI (1ULL << __REQ_HIPRI)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0c482371c8b3..1ef375dafb1c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -344,10 +344,15 @@ struct queue_limits {
#ifdef CONFIG_BLK_DEV_ZONED
+/*
+ * Maximum number of zones to report with a single report zones command.
+ */
+#define BLK_ZONED_REPORT_MAX_ZONES 8192U
+
extern unsigned int blkdev_nr_zones(struct block_device *bdev);
extern int blkdev_report_zones(struct block_device *bdev,
sector_t sector, struct blk_zone *zones,
- unsigned int *nr_zones, gfp_t gfp_mask);
+ unsigned int *nr_zones);
extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
sector_t nr_sectors, gfp_t gfp_mask);
extern int blk_revalidate_disk_zones(struct gendisk *disk);
@@ -681,7 +686,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
}
}
-static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
+static inline sector_t blk_queue_zone_sectors(struct request_queue *q)
{
return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
}
@@ -1418,7 +1423,7 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
return false;
}
-static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
+static inline sector_t bdev_zone_sectors(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
@@ -1673,8 +1678,7 @@ struct block_device_operations {
/* this callback is with swap_lock and sometimes page table lock held */
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
int (*report_zones)(struct gendisk *, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask);
+ struct blk_zone *zones, unsigned int *nr_zones);
struct module *owner;
const struct pr_ops *pr_ops;
};
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2af9b1b419f1..f6b048902d6c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -699,6 +699,7 @@ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
struct cgroup_subsys_state;
struct cgroup;
+static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e1f51d607cc5..3b470cb03b66 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -95,8 +95,7 @@ typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **
typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector,
struct blk_zone *zones,
- unsigned int *nr_zones,
- gfp_t gfp_mask);
+ unsigned int *nr_zones);
/*
* These iteration functions are typically used to check (and combine)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 169bb2e02516..17cd0078377c 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -75,7 +75,7 @@ struct elevator_type
size_t icq_size; /* see iocontext.h */
size_t icq_align; /* ditto */
struct elv_fs_entry *elevator_attrs;
- char elevator_name[ELV_NAME_MAX];
+ const char *elevator_name;
const char *elevator_alias;
struct module *elevator_owner;
#ifdef CONFIG_BLK_DEBUG_FS
@@ -160,15 +160,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define ELEVATOR_INSERT_FLUSH 5
#define ELEVATOR_INSERT_SORT_MERGE 6
-/*
- * return values from elevator_may_queue_fn
- */
-enum {
- ELV_MQUEUE_MAY,
- ELV_MQUEUE_NO,
- ELV_MQUEUE_MUST,
-};
-
#define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq))
#define rb_entry_rq(node) rb_entry((node), struct request, rb_node)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d98b2d8baf4e..01aa6a6c241d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -315,7 +315,7 @@ struct nvme_id_ns {
__u8 nmic;
__u8 rescap;
__u8 fpi;
- __u8 rsvd33;
+ __u8 dlfeat;
__le16 nawun;
__le16 nawupf;
__le16 nacwu;
@@ -324,11 +324,17 @@ struct nvme_id_ns {
__le16 nabspf;
__le16 noiob;
__u8 nvmcap[16];
- __u8 rsvd64[28];
+ __le16 npwg;
+ __le16 npwa;
+ __le16 npdg;
+ __le16 npda;
+ __le16 nows;
+ __u8 rsvd74[18];
__le32 anagrpid;
__u8 rsvd96[3];
__u8 nsattr;
- __u8 rsvd100[4];
+ __le16 nvmsetid;
+ __le16 endgid;
__u8 nguid[16];
__u8 eui64[8];
struct nvme_lbaf lbaf[16];
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 738a0c24874f..8945aac31392 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -11,6 +11,7 @@
#include <linux/flex_proportions.h>
#include <linux/backing-dev-defs.h>
#include <linux/blk_types.h>
+#include <linux/blk-cgroup.h>
struct bio;
@@ -68,6 +69,17 @@ struct writeback_control {
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
+
+ /*
+ * When writeback IOs are bounced through async layers, only the
+ * initial synchronous phase should be accounted towards inode
+ * cgroup ownership arbitration to avoid confusion. Later stages
+ * can set the following flag to disable the accounting.
+ */
+ unsigned no_cgroup_owner:1;
+
+ unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *wb; /* wb this writeback is issued under */
struct inode *inode; /* inode being written out */
@@ -84,12 +96,27 @@ struct writeback_control {
static inline int wbc_to_write_flags(struct writeback_control *wbc)
{
+ int flags = 0;
+
+ if (wbc->punt_to_cgroup)
+ flags = REQ_CGROUP_PUNT;
+
if (wbc->sync_mode == WB_SYNC_ALL)
- return REQ_SYNC;
+ flags |= REQ_SYNC;
else if (wbc->for_kupdate || wbc->for_background)
- return REQ_BACKGROUND;
+ flags |= REQ_BACKGROUND;
- return 0;
+ return flags;
+}
+
+static inline struct cgroup_subsys_state *
+wbc_blkcg_css(struct writeback_control *wbc)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+ if (wbc->wb)
+ return wbc->wb->blkcg_css;
+#endif
+ return blkcg_root_css;
}
/*
@@ -188,8 +215,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
struct inode *inode)
__releases(&inode->i_lock);
void wbc_detach_inode(struct writeback_control *wbc);
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
- size_t bytes);
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+ size_t bytes);
void cgroup_writeback_umount(void);
/**
@@ -291,8 +318,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
}
-static inline void wbc_account_io(struct writeback_control *wbc,
- struct page *page, size_t bytes)
+static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
+ struct page *page, size_t bytes)
{
}