summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux')
-rw-r--r-- include/linux/bio.h 2
-rw-r--r-- include/linux/blk-mq.h 5
-rw-r--r-- include/linux/blk_types.h 31
-rw-r--r-- include/linux/blkdev.h 30
-rw-r--r-- include/linux/drbd.h 10
-rw-r--r-- include/linux/drbd_genl.h 7
-rw-r--r-- include/linux/drbd_limits.h 15
-rw-r--r-- include/linux/fs.h 18
-rw-r--r-- include/linux/genhd.h 8
-rw-r--r-- include/linux/lightnvm.h 34
-rw-r--r-- include/linux/nvme-rdma.h 71
-rw-r--r-- include/linux/nvme.h 406
12 files changed, 537 insertions, 100 deletions
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b7e1a00810f2..583c10810e32 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -663,8 +663,6 @@ static inline void bio_inc_remaining(struct bio *bio)
* and the bvec_slabs[].
*/
#define BIO_POOL_SIZE 2
-#define BIOVEC_NR_POOLS 6
-#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1)
struct bio_set {
struct kmem_cache *bio_slab;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2498fdf3a503..e43bbffb5b7a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -96,6 +96,7 @@ typedef int (init_request_fn)(void *, struct request *, unsigned int,
unsigned int, unsigned int);
typedef void (exit_request_fn)(void *, struct request *, unsigned int,
unsigned int);
+typedef int (reinit_request_fn)(void *, struct request *);
typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
bool);
@@ -145,6 +146,7 @@ struct blk_mq_ops {
*/
init_request_fn *init_request;
exit_request_fn *exit_request;
+ reinit_request_fn *reinit_request;
};
enum {
@@ -196,6 +198,8 @@ enum {
struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
unsigned int flags);
+struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op,
+ unsigned int flags, unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
@@ -243,6 +247,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_mq_freeze_queue_start(struct request_queue *q);
+int blk_mq_reinit_tagset(struct blk_mq_tag_set *set);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b588e968dc01..f254eb264924 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -26,11 +26,11 @@ typedef void (bio_destructor_t) (struct bio *);
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
- unsigned int bi_flags; /* status, command, etc */
int bi_error;
unsigned int bi_rw; /* bottom bits req flags,
* top bits REQ_OP
*/
+ unsigned short bi_flags; /* status, command, etc */
unsigned short bi_ioprio;
struct bvec_iter bi_iter;
@@ -114,19 +114,25 @@ struct bio {
/*
* Flags starting here get preserved by bio_reset() - this includes
- * BIO_POOL_IDX()
+ * BVEC_POOL_IDX()
*/
-#define BIO_RESET_BITS 13
-#define BIO_OWNS_VEC 13 /* bio_free() should free bvec */
+#define BIO_RESET_BITS 10
/*
- * top 4 bits of bio flags indicate the pool this bio came from
+ * We support 6 different bvec pools, the last one is magic in that it
+ * is backed by a mempool.
*/
-#define BIO_POOL_BITS (4)
-#define BIO_POOL_NONE ((1UL << BIO_POOL_BITS) - 1)
-#define BIO_POOL_OFFSET (32 - BIO_POOL_BITS)
-#define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET)
-#define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET)
+#define BVEC_POOL_NR 6
+#define BVEC_POOL_MAX (BVEC_POOL_NR - 1)
+
+/*
+ * Top 4 bits of bio flags indicate the pool the bvecs came from. We add
+ * 1 to the actual index so that 0 indicates that there are no bvecs to be
+ * freed.
+ */
+#define BVEC_POOL_BITS (4)
+#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS)
+#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET)
#endif /* CONFIG_BLOCK */
@@ -143,7 +149,6 @@ enum rq_flag_bits {
__REQ_SYNC, /* request is sync (sync write or read) */
__REQ_META, /* metadata io request */
__REQ_PRIO, /* boost priority in cfq */
- __REQ_SECURE, /* secure discard (used with REQ_OP_DISCARD) */
__REQ_NOIDLE, /* don't anticipate more IO after this one */
__REQ_INTEGRITY, /* I/O includes block integrity payload */
@@ -192,7 +197,7 @@ enum rq_flag_bits {
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
#define REQ_COMMON_MASK \
(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
- REQ_PREFLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE)
+ REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
#define REQ_CLONE_MASK REQ_COMMON_MASK
/* This mask is used for both bio and request merge checking */
@@ -219,7 +224,6 @@ enum rq_flag_bits {
#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ)
#define REQ_IO_STAT (1ULL << __REQ_IO_STAT)
#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE)
-#define REQ_SECURE (1ULL << __REQ_SECURE)
#define REQ_PM (1ULL << __REQ_PM)
#define REQ_HASHED (1ULL << __REQ_HASHED)
#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
@@ -228,6 +232,7 @@ enum req_op {
REQ_OP_READ,
REQ_OP_WRITE,
REQ_OP_DISCARD, /* request to discard sectors */
+ REQ_OP_SECURE_ERASE, /* request to securely erase sectors */
REQ_OP_WRITE_SAME, /* write same block many times */
REQ_OP_FLUSH, /* request for cache flush */
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 48f05d768a53..c96db9c22d10 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -496,7 +496,7 @@ struct request_queue {
#define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */
#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */
#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */
-#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */
+#define QUEUE_FLAG_SECERASE 17 /* supports secure erase */
#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */
#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */
#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
@@ -593,8 +593,8 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
#define blk_queue_stackable(q) \
test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
-#define blk_queue_secdiscard(q) (blk_queue_discard(q) && \
- test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
+#define blk_queue_secure_erase(q) \
+ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
#define blk_noretry_request(rq) \
@@ -676,21 +676,6 @@ static inline bool rq_mergeable(struct request *rq)
return true;
}
-static inline bool blk_check_merge_flags(unsigned int flags1, unsigned int op1,
- unsigned int flags2, unsigned int op2)
-{
- if ((op1 == REQ_OP_DISCARD) != (op2 == REQ_OP_DISCARD))
- return false;
-
- if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE))
- return false;
-
- if ((op1 == REQ_OP_WRITE_SAME) != (op2 == REQ_OP_WRITE_SAME))
- return false;
-
- return true;
-}
-
static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
{
if (bio_data(a) == bio_data(b))
@@ -804,8 +789,6 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq);
extern void blk_put_request(struct request *);
extern void __blk_put_request(struct request_queue *, struct request *);
extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
-extern struct request *blk_make_request(struct request_queue *, struct bio *,
- gfp_t);
extern void blk_rq_set_block_pc(struct request *);
extern void blk_requeue_request(struct request_queue *, struct request *);
extern void blk_add_request_payload(struct request *rq, struct page *page,
@@ -818,6 +801,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
extern void blk_rq_unprep_clone(struct request *rq);
extern int blk_insert_cloned_request(struct request_queue *q,
struct request *rq);
+extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
extern void blk_delay_queue(struct request_queue *, unsigned long);
extern void blk_queue_split(struct request_queue *, struct bio **,
struct bio_set *);
@@ -1154,13 +1138,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
return bqt->tag_index[tag];
}
-#define BLKDEV_DISCARD_SECURE 0x01 /* secure discard */
+
+#define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */
+#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */
extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, int op_flags,
+ sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop);
extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct page *page);
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index d6b3c9943a2c..002611c85318 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -51,7 +51,7 @@
#endif
extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.6"
+#define REL_VERSION "8.4.7"
#define API_VERSION 1
#define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 101
@@ -370,6 +370,14 @@ enum drbd_notification_type {
NOTIFY_FLAGS = NOTIFY_CONTINUES,
};
+enum drbd_peer_state {
+ P_INCONSISTENT = 3,
+ P_OUTDATED = 4,
+ P_DOWN = 5,
+ P_PRIMARY = 6,
+ P_FENCING = 7,
+};
+
#define UUID_JUST_CREATED ((__u64)4)
enum write_ordering_e {
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 2d0e5ad5de9d..c934d3a96b5e 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -123,15 +123,16 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf,
__u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF)
__u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF)
__u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF)
+ __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
+ __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF)
+ __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF)
__flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF)
__flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF)
__flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF)
__flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF)
- __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
- __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF)
- /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */
__flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF)
+ __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED)
)
GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 8ac8c5d9a3ad..ddac68422a96 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -126,8 +126,7 @@
#define DRBD_RESYNC_RATE_DEF 250
#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */
- /* less than 7 would hit performance unnecessarily. */
-#define DRBD_AL_EXTENTS_MIN 7
+#define DRBD_AL_EXTENTS_MIN 67
/* we use u16 as "slot number", (u16)~0 is "FREE".
* If you use >= 292 kB on-disk ring buffer,
* this is the maximum you can use: */
@@ -210,6 +209,12 @@
#define DRBD_MD_FLUSHES_DEF 1
#define DRBD_TCP_CORK_DEF 1
#define DRBD_AL_UPDATES_DEF 1
+/* We used to ignore the discard_zeroes_data setting.
+ * To not change established (and expected) behaviour,
+ * by default assume that, for discard_zeroes_data=0,
+ * we can make that an effective discard_zeroes_data=1,
+ * if we only explicitly zero-out unaligned partial chunks. */
+#define DRBD_DISCARD_ZEROES_IF_ALIGNED 1
#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0
#define DRBD_ALWAYS_ASBP_DEF 0
@@ -230,4 +235,10 @@
#define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX
#define DRBD_SOCKET_CHECK_TIMEO_DEF 0
#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1'
+
+#define DRBD_RS_DISCARD_GRANULARITY_MIN 0
+#define DRBD_RS_DISCARD_GRANULARITY_MAX (1<<20) /* 1MiByte */
+#define DRBD_RS_DISCARD_GRANULARITY_DEF 0 /* disabled by default */
+#define DRBD_RS_DISCARD_GRANULARITY_SCALE '1' /* bytes */
+
#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 183024525d40..dc488662ce0b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -178,9 +178,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
* READ_SYNC A synchronous read. Device is not plugged, caller can
* immediately wait on this read without caring about
* unplugging.
- * READA Used for read-ahead operations. Lower priority, and the
- * block layer could (in theory) choose to ignore this
- * request if it runs into resource problems.
* WRITE A normal async write. Device will be plugged.
* WRITE_SYNC Synchronous write. Identical to WRITE, but passes down
* the hint that someone will be waiting on this IO
@@ -195,11 +192,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
*
*/
#define RW_MASK REQ_OP_WRITE
-#define RWA_MASK REQ_RAHEAD
#define READ REQ_OP_READ
-#define WRITE RW_MASK
-#define READA RWA_MASK
+#define WRITE REQ_OP_WRITE
#define READ_SYNC REQ_SYNC
#define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE)
@@ -2471,17 +2466,6 @@ static inline bool op_is_write(unsigned int op)
}
/*
- * return READ, READA, or WRITE
- */
-static inline int bio_rw(struct bio *bio)
-{
- if (op_is_write(bio_op(bio)))
- return WRITE;
-
- return bio->bi_rw & RWA_MASK;
-}
-
-/*
* return data direction, READ or WRITE
*/
static inline int bio_data_dir(struct bio *bio)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 359a8e4bd44d..1dbf52f9c24b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -205,7 +205,6 @@ struct gendisk {
void *private_data;
int flags;
- struct device *driverfs_dev; // FIXME: remove
struct kobject *slave_dir;
struct timer_rand_state *random;
@@ -414,7 +413,12 @@ static inline void free_part_info(struct hd_struct *part)
extern void part_round_stats(int cpu, struct hd_struct *part);
/* block/genhd.c */
-extern void add_disk(struct gendisk *disk);
+extern void device_add_disk(struct device *parent, struct gendisk *disk);
+/* Convenience wrapper: calls device_add_disk() with a NULL parent device. */
+static inline void add_disk(struct gendisk *disk)
+{
+ device_add_disk(NULL, disk);
+}
+
extern void del_gendisk(struct gendisk *gp);
extern struct gendisk *get_gendisk(dev_t dev, int *partno);
extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ef2c7d2e76c4..ba78b8306674 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -1,7 +1,9 @@
#ifndef NVM_H
#define NVM_H
+#include <linux/blkdev.h>
#include <linux/types.h>
+#include <uapi/linux/lightnvm.h>
enum {
NVM_IO_OK = 0,
@@ -269,24 +271,15 @@ struct nvm_lun {
int lun_id;
int chnl_id;
- /* It is up to the target to mark blocks as closed. If the target does
- * not do it, all blocks are marked as open, and nr_open_blocks
- * represents the number of blocks in use
- */
- unsigned int nr_open_blocks; /* Number of used, writable blocks */
- unsigned int nr_closed_blocks; /* Number of used, read-only blocks */
- unsigned int nr_free_blocks; /* Number of unused blocks */
- unsigned int nr_bad_blocks; /* Number of bad blocks */
-
spinlock_t lock;
+ unsigned int nr_free_blocks; /* Number of unused blocks */
struct nvm_block *blocks;
};
enum {
NVM_BLK_ST_FREE = 0x1, /* Free block */
- NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */
- NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */
+ NVM_BLK_ST_TGT = 0x2, /* Block in use by target */
NVM_BLK_ST_BAD = 0x8, /* Bad block */
};
@@ -385,6 +378,7 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev,
{
struct ppa_addr l;
+ l.ppa = 0;
/*
* (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc.
*/
@@ -455,6 +449,8 @@ struct nvm_tgt_type {
struct list_head list;
};
+extern struct nvm_tgt_type *nvm_find_target_type(const char *, int);
+
extern int nvm_register_tgt_type(struct nvm_tgt_type *);
extern void nvm_unregister_tgt_type(struct nvm_tgt_type *);
@@ -463,6 +459,9 @@ extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t);
typedef int (nvmm_register_fn)(struct nvm_dev *);
typedef void (nvmm_unregister_fn)(struct nvm_dev *);
+
+typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *);
+typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *,
struct nvm_lun *, unsigned long);
typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *);
@@ -488,9 +487,10 @@ struct nvmm_type {
nvmm_register_fn *register_mgr;
nvmm_unregister_fn *unregister_mgr;
+ nvmm_create_tgt_fn *create_tgt;
+ nvmm_remove_tgt_fn *remove_tgt;
+
/* Block administration callbacks */
- nvmm_get_blk_fn *get_blk_unlocked;
- nvmm_put_blk_fn *put_blk_unlocked;
nvmm_get_blk_fn *get_blk;
nvmm_put_blk_fn *put_blk;
nvmm_open_blk_fn *open_blk;
@@ -520,10 +520,6 @@ struct nvmm_type {
extern int nvm_register_mgr(struct nvmm_type *);
extern void nvm_unregister_mgr(struct nvmm_type *);
-extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *,
- struct nvm_lun *, unsigned long);
-extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *);
-
extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *,
unsigned long);
extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *);
@@ -532,11 +528,13 @@ extern int nvm_register(struct request_queue *, char *,
struct nvm_dev_ops *);
extern void nvm_unregister(char *);
+void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type);
+
extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *);
extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
- struct ppa_addr *, int, int);
+ const struct ppa_addr *, int, int);
extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int);
extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
new file mode 100644
index 000000000000..bf240a3cbf99
--- /dev/null
+++ b/include/linux/nvme-rdma.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_NVME_RDMA_H
+#define _LINUX_NVME_RDMA_H
+
+enum nvme_rdma_cm_fmt {
+ NVME_RDMA_CM_FMT_1_0 = 0x0,
+};
+
+enum nvme_rdma_cm_status {
+ NVME_RDMA_CM_INVALID_LEN = 0x01,
+ NVME_RDMA_CM_INVALID_RECFMT = 0x02,
+ NVME_RDMA_CM_INVALID_QID = 0x03,
+ NVME_RDMA_CM_INVALID_HSQSIZE = 0x04,
+ NVME_RDMA_CM_INVALID_HRQSIZE = 0x05,
+ NVME_RDMA_CM_NO_RSC = 0x06,
+ NVME_RDMA_CM_INVALID_IRD = 0x07,
+ NVME_RDMA_CM_INVALID_ORD = 0x08,
+};
+
+/**
+ * struct nvme_rdma_cm_req - rdma connect request
+ *
+ * @recfmt: format of the RDMA Private Data
+ * @qid: queue Identifier for the Admin or I/O Queue
+ * @hrqsize: host receive queue size to be created
+ * @hsqsize: host send queue size to be created
+ */
+struct nvme_rdma_cm_req {
+ __le16 recfmt;
+ __le16 qid;
+ __le16 hrqsize;
+ __le16 hsqsize;
+ u8 rsvd[24];
+};
+
+/**
+ * struct nvme_rdma_cm_rep - rdma connect reply
+ *
+ * @recfmt: format of the RDMA Private Data
+ * @crqsize: controller receive queue size
+ */
+struct nvme_rdma_cm_rep {
+ __le16 recfmt;
+ __le16 crqsize;
+ u8 rsvd[28];
+};
+
+/**
+ * struct nvme_rdma_cm_rej - rdma connect reject
+ *
+ * @recfmt: format of the RDMA Private Data
+ * @sts: error status for the associated connect request
+ */
+struct nvme_rdma_cm_rej {
+ __le16 recfmt;
+ __le16 sts;
+};
+
+#endif /* _LINUX_NVME_RDMA_H */
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 7d51b2904cb7..d8b37bab2887 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -16,6 +16,78 @@
#define _LINUX_NVME_H
#include <linux/types.h>
+#include <linux/uuid.h>
+
+/* NQN fields in commands have a fixed size of 256 bytes */
+#define NVMF_NQN_FIELD_LEN 256
+
+/* However, the maximum length of the qualified name itself is 223 bytes */
+#define NVMF_NQN_SIZE 223
+
+#define NVMF_TRSVCID_SIZE 32
+#define NVMF_TRADDR_SIZE 256
+#define NVMF_TSAS_SIZE 256
+
+#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery"
+
+#define NVME_RDMA_IP_PORT 4420
+
+enum nvme_subsys_type {
+ NVME_NQN_DISC = 1, /* Discovery type target subsystem */
+ NVME_NQN_NVME = 2, /* NVME type target subsystem */
+};
+
+/* Address Family codes for Discovery Log Page entry ADRFAM field */
+enum {
+ NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */
+ NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */
+ NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */
+ NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */
+ NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */
+};
+
+/* Transport Type codes for Discovery Log Page entry TRTYPE field */
+enum {
+ NVMF_TRTYPE_RDMA = 1, /* RDMA */
+ NVMF_TRTYPE_FC = 2, /* Fibre Channel */
+ NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */
+ NVMF_TRTYPE_MAX,
+};
+
+/* Transport Requirements codes for Discovery Log Page entry TREQ field */
+enum {
+ NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */
+ NVMF_TREQ_REQUIRED = 1, /* Required */
+ NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */
+};
+
+/* RDMA QP Service Type codes for Discovery Log Page entry TSAS
+ * RDMA_QPTYPE field.
+ *
+ * The codes are numbered from 1, as in the NVMe over Fabrics
+ * specification; 0 is not a valid QP service type.
+ */
+enum {
+	NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */
+	NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */
+};
+
+/* RDMA Provider Type codes for Discovery Log Page entry TSAS
+ * RDMA_PRTYPE field.
+ *
+ * The codes are numbered from 1, as in the NVMe over Fabrics
+ * specification; 0 is not a valid provider type.
+ */
+enum {
+	NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */
+	NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */
+	NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */
+	NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */
+	NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */
+};
+
+/* RDMA Connection Management Service Type codes for Discovery Log Page
+ * entry TSAS RDMA_CMS field.
+ *
+ * The code is numbered from 1, as in the NVMe over Fabrics specification.
+ */
+enum {
+	NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */
+};
+
+#define NVMF_AQ_DEPTH 32
enum {
NVME_REG_CAP = 0x0000, /* Controller Capabilities */
@@ -50,6 +122,13 @@ enum {
#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2)
#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1)
+/*
+ * Submission and Completion Queue Entry Sizes for the NVM command set.
+ * (In bytes and specified as a power of two (2^n)).
+ */
+#define NVME_NVM_IOSQES 6
+#define NVME_NVM_IOCQES 4
+
enum {
NVME_CC_ENABLE = 1 << 0,
NVME_CC_CSS_NVM = 0 << 4,
@@ -61,8 +140,8 @@ enum {
NVME_CC_SHN_NORMAL = 1 << 14,
NVME_CC_SHN_ABRUPT = 2 << 14,
NVME_CC_SHN_MASK = 3 << 14,
- NVME_CC_IOSQES = 6 << 16,
- NVME_CC_IOCQES = 4 << 20,
+ NVME_CC_IOSQES = NVME_NVM_IOSQES << 16,
+ NVME_CC_IOCQES = NVME_NVM_IOCQES << 20,
NVME_CSTS_RDY = 1 << 0,
NVME_CSTS_CFS = 1 << 1,
NVME_CSTS_NSSRO = 1 << 4,
@@ -107,7 +186,11 @@ struct nvme_id_ctrl {
__u8 mdts;
__le16 cntlid;
__le32 ver;
- __u8 rsvd84[172];
+ __le32 rtd3r;
+ __le32 rtd3e;
+ __le32 oaes;
+ __le32 ctratt;
+ __u8 rsvd100[156];
__le16 oacs;
__u8 acl;
__u8 aerl;
@@ -119,10 +202,12 @@ struct nvme_id_ctrl {
__u8 apsta;
__le16 wctemp;
__le16 cctemp;
- __u8 rsvd270[242];
+ __u8 rsvd270[50];
+ __le16 kas;
+ __u8 rsvd322[190];
__u8 sqes;
__u8 cqes;
- __u8 rsvd514[2];
+ __le16 maxcmd;
__le32 nn;
__le16 oncs;
__le16 fuses;
@@ -135,7 +220,15 @@ struct nvme_id_ctrl {
__le16 acwu;
__u8 rsvd534[2];
__le32 sgls;
- __u8 rsvd540[1508];
+ __u8 rsvd540[228];
+ char subnqn[256];
+ __u8 rsvd1024[768];
+ __le32 ioccsz;
+ __le32 iorcsz;
+ __le16 icdoff;
+ __u8 ctrattr;
+ __u8 msdbd;
+ __u8 rsvd1804[244];
struct nvme_id_power_state psd[32];
__u8 vs[1024];
};
@@ -274,6 +367,12 @@ struct nvme_reservation_status {
} regctl_ds[];
};
+enum nvme_async_event_type {
+ NVME_AER_TYPE_ERROR = 0,
+ NVME_AER_TYPE_SMART = 1,
+ NVME_AER_TYPE_NOTICE = 2,
+};
+
/* I/O commands */
enum nvme_opcode {
@@ -290,6 +389,84 @@ enum nvme_opcode {
nvme_cmd_resv_release = 0x15,
};
+/*
+ * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier
+ *
+ * @NVME_SGL_FMT_ADDRESS: absolute address of the data block
+ * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block
+ * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation
+ * request subtype
+ */
+enum {
+ NVME_SGL_FMT_ADDRESS = 0x00,
+ NVME_SGL_FMT_OFFSET = 0x01,
+ NVME_SGL_FMT_INVALIDATE = 0x0f,
+};
+
+/*
+ * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier
+ *
+ * For struct nvme_sgl_desc:
+ * @NVME_SGL_FMT_DATA_DESC: data block descriptor
+ * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor
+ * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor
+ *
+ * For struct nvme_keyed_sgl_desc:
+ * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor
+ */
+enum {
+ NVME_SGL_FMT_DATA_DESC = 0x00,
+ NVME_SGL_FMT_SEG_DESC = 0x02,
+ NVME_SGL_FMT_LAST_SEG_DESC = 0x03,
+ NVME_KEY_SGL_FMT_DATA_DESC = 0x04,
+};
+
+struct nvme_sgl_desc {
+ __le64 addr;
+ __le32 length;
+ __u8 rsvd[3];
+ __u8 type;
+};
+
+struct nvme_keyed_sgl_desc {
+ __le64 addr;
+ __u8 length[3];
+ __u8 key[4];
+ __u8 type;
+};
+
+union nvme_data_ptr {
+ struct {
+ __le64 prp1;
+ __le64 prp2;
+ };
+ struct nvme_sgl_desc sgl;
+ struct nvme_keyed_sgl_desc ksgl;
+};
+
+/*
+ * Lowest two bits of our flags field (FUSE field in the spec):
+ *
+ * @NVME_CMD_FUSE_FIRST: Fused Operation, first command
+ * @NVME_CMD_FUSE_SECOND: Fused Operation, second command
+ *
+ * Highest two bits in our flags field (PSDT field in the spec):
+ *
+ * @NVME_CMD_SGL_METABUF: Use SGLS for this transfer,
+ * If used, MPTR contains addr of single physical buffer (byte aligned).
+ * @NVME_CMD_SGL_METASEG: Use SGLS for this transfer,
+ * If used, MPTR contains an address of an SGL segment containing
+ * exactly 1 SGL descriptor (qword aligned).
+ */
+enum {
+ NVME_CMD_FUSE_FIRST = (1 << 0),
+ NVME_CMD_FUSE_SECOND = (1 << 1),
+
+ NVME_CMD_SGL_METABUF = (1 << 6),
+ NVME_CMD_SGL_METASEG = (1 << 7),
+ NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG,
+};
+
struct nvme_common_command {
__u8 opcode;
__u8 flags;
@@ -297,8 +474,7 @@ struct nvme_common_command {
__le32 nsid;
__le32 cdw2[2];
__le64 metadata;
- __le64 prp1;
- __le64 prp2;
+ union nvme_data_ptr dptr;
__le32 cdw10[6];
};
@@ -309,8 +485,7 @@ struct nvme_rw_command {
__le32 nsid;
__u64 rsvd2;
__le64 metadata;
- __le64 prp1;
- __le64 prp2;
+ union nvme_data_ptr dptr;
__le64 slba;
__le16 length;
__le16 control;
@@ -350,8 +525,7 @@ struct nvme_dsm_cmd {
__u16 command_id;
__le32 nsid;
__u64 rsvd2[2];
- __le64 prp1;
- __le64 prp2;
+ union nvme_data_ptr dptr;
__le32 nr;
__le32 attributes;
__u32 rsvd12[4];
@@ -384,6 +558,7 @@ enum nvme_admin_opcode {
nvme_admin_async_event = 0x0c,
nvme_admin_activate_fw = 0x10,
nvme_admin_download_fw = 0x11,
+ nvme_admin_keep_alive = 0x18,
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
nvme_admin_security_recv = 0x82,
@@ -408,6 +583,7 @@ enum {
NVME_FEAT_WRITE_ATOMIC = 0x0a,
NVME_FEAT_ASYNC_EVENT = 0x0b,
NVME_FEAT_AUTO_PST = 0x0c,
+ NVME_FEAT_KATO = 0x0f,
NVME_FEAT_SW_PROGRESS = 0x80,
NVME_FEAT_HOST_ID = 0x81,
NVME_FEAT_RESV_MASK = 0x82,
@@ -415,6 +591,7 @@ enum {
NVME_LOG_ERROR = 0x01,
NVME_LOG_SMART = 0x02,
NVME_LOG_FW_SLOT = 0x03,
+ NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
NVME_FWACT_REPL = (0 << 3),
NVME_FWACT_REPL_ACTV = (1 << 3),
@@ -427,8 +604,7 @@ struct nvme_identify {
__u16 command_id;
__le32 nsid;
__u64 rsvd2[2];
- __le64 prp1;
- __le64 prp2;
+ union nvme_data_ptr dptr;
__le32 cns;
__u32 rsvd11[5];
};
@@ -439,8 +615,7 @@ struct nvme_features {
__u16 command_id;
__le32 nsid;
__u64 rsvd2[2];
- __le64 prp1;
- __le64 prp2;
+ union nvme_data_ptr dptr;
__le32 fid;
__le32 dword11;
__u32 rsvd12[4];
@@ -499,8 +674,7 @@ struct nvme_download_firmware {
__u8 flags;
__u16 command_id;
__u32 rsvd1[5];
- __le64 prp1;
- __le64 prp2;
+ union nvme_data_ptr dptr;
__le32 numd;
__le32 offset;
__u32 rsvd12[4];
@@ -516,6 +690,143 @@ struct nvme_format_cmd {
__u32 rsvd11[5];
};
+struct nvme_get_log_page_command {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __u64 rsvd2[2];
+ union nvme_data_ptr dptr;
+ __u8 lid;
+ __u8 rsvd10;
+ __le16 numdl;
+ __le16 numdu;
+ __u16 rsvd11;
+ __le32 lpol;
+ __le32 lpou;
+ __u32 rsvd14[2];
+};
+
+/*
+ * Fabrics subcommands.
+ */
+enum nvmf_fabrics_opcode {
+ nvme_fabrics_command = 0x7f,
+};
+
+enum nvmf_capsule_command {
+ nvme_fabrics_type_property_set = 0x00,
+ nvme_fabrics_type_connect = 0x01,
+ nvme_fabrics_type_property_get = 0x04,
+};
+
+struct nvmf_common_command {
+ __u8 opcode;
+ __u8 resv1;
+ __u16 command_id;
+ __u8 fctype;
+ __u8 resv2[35];
+ __u8 ts[24];
+};
+
+/*
+ * The legal cntlid range a NVMe Target will provide.
+ * Note that cntlid of value 0 is considered illegal in the fabrics world.
+ * Devices based on earlier specs did not have the subsystem concept;
+ * therefore, those devices had their cntlid value set to 0 as a result.
+ */
+#define NVME_CNTLID_MIN 1
+#define NVME_CNTLID_MAX 0xffef
+#define NVME_CNTLID_DYNAMIC 0xffff
+
+#define MAX_DISC_LOGS 255
+
+/*
+ * Discovery log page entry.
+ *
+ * Multi-byte fields are little-endian and are annotated as such so that
+ * sparse can flag accesses that skip the byte-order conversion.  The tsas
+ * union overlays the transport-specific view (currently only RDMA) on the
+ * raw NVMF_TSAS_SIZE byte area.
+ */
+struct nvmf_disc_rsp_page_entry {
+	__u8 trtype;
+	__u8 adrfam;
+	__u8 nqntype;
+	__u8 treq;
+	__le16 portid;
+	__le16 cntlid;
+	__le16 asqsz;
+	__u8 resv8[22];
+	char trsvcid[NVMF_TRSVCID_SIZE];
+	__u8 resv64[192];
+	char subnqn[NVMF_NQN_FIELD_LEN];
+	char traddr[NVMF_TRADDR_SIZE];
+	union tsas {
+		char common[NVMF_TSAS_SIZE];
+		struct rdma {
+			__u8 qptype;
+			__u8 prtype;
+			__u8 cms;
+			__u8 resv3[5];
+			__le16 pkey;	/* little-endian like the other 16-bit fields */
+			__u8 resv10[246];
+		} rdma;
+	} tsas;
+};
+
+/*
+ * Discovery log page header.
+ *
+ * The fixed-size header is followed by a variable number of log entries;
+ * @entries is a C99 flexible array member and adds nothing to
+ * sizeof(struct nvmf_disc_rsp_page_hdr).
+ */
+struct nvmf_disc_rsp_page_hdr {
+	__le64 genctr;
+	__le64 numrec;
+	__le16 recfmt;
+	__u8 resv14[1006];
+	struct nvmf_disc_rsp_page_entry entries[];
+};
+
+struct nvmf_connect_command {
+ __u8 opcode;
+ __u8 resv1;
+ __u16 command_id;
+ __u8 fctype;
+ __u8 resv2[19];
+ union nvme_data_ptr dptr;
+ __le16 recfmt;
+ __le16 qid;
+ __le16 sqsize;
+ __u8 cattr;
+ __u8 resv3;
+ __le32 kato;
+ __u8 resv4[12];
+};
+
+struct nvmf_connect_data {
+ uuid_le hostid;
+ __le16 cntlid;
+ char resv4[238];
+ char subsysnqn[NVMF_NQN_FIELD_LEN];
+ char hostnqn[NVMF_NQN_FIELD_LEN];
+ char resv5[256];
+};
+
+struct nvmf_property_set_command {
+ __u8 opcode;
+ __u8 resv1;
+ __u16 command_id;
+ __u8 fctype;
+ __u8 resv2[35];
+ __u8 attrib;
+ __u8 resv3[3];
+ __le32 offset;
+ __le64 value;
+ __u8 resv4[8];
+};
+
+struct nvmf_property_get_command {
+ __u8 opcode;
+ __u8 resv1;
+ __u16 command_id;
+ __u8 fctype;
+ __u8 resv2[35];
+ __u8 attrib;
+ __u8 resv3[3];
+ __le32 offset;
+ __u8 resv4[16];
+};
+
struct nvme_command {
union {
struct nvme_common_command common;
@@ -529,10 +840,30 @@ struct nvme_command {
struct nvme_format_cmd format;
struct nvme_dsm_cmd dsm;
struct nvme_abort_cmd abort;
+ struct nvme_get_log_page_command get_log_page;
+ struct nvmf_common_command fabrics;
+ struct nvmf_connect_command connect;
+ struct nvmf_property_set_command prop_set;
+ struct nvmf_property_get_command prop_get;
};
};
+/*
+ * nvme_is_write - classify @cmd as a write by the low bit of its opcode.
+ *
+ * Regular commands are treated as writes when bit 0 of the opcode is set.
+ * Every Fabrics command carries the same opcode (nvme_fabrics_command,
+ * whose bit 0 is always set), so for those the decision is made on bit 0
+ * of the Fabrics command type (fctype) instead; testing the opcode there
+ * would report every Fabrics command as a write.
+ *
+ * Returns true when the command is classified as a write.
+ */
+static inline bool nvme_is_write(struct nvme_command *cmd)
+{
+	/*
+	 * What a mess...
+	 *
+	 * Why can't we simply have a Fabrics In and Fabrics out command?
+	 */
+	if (unlikely(cmd->common.opcode == nvme_fabrics_command))
+		return cmd->fabrics.fctype & 1;
+	return cmd->common.opcode & 1;
+}
+
enum {
+ /*
+ * Generic Command Status:
+ */
NVME_SC_SUCCESS = 0x0,
NVME_SC_INVALID_OPCODE = 0x1,
NVME_SC_INVALID_FIELD = 0x2,
@@ -551,10 +882,18 @@ enum {
NVME_SC_SGL_INVALID_DATA = 0xf,
NVME_SC_SGL_INVALID_METADATA = 0x10,
NVME_SC_SGL_INVALID_TYPE = 0x11,
+
+ NVME_SC_SGL_INVALID_OFFSET = 0x16,
+ NVME_SC_SGL_INVALID_SUBTYPE = 0x17,
+
NVME_SC_LBA_RANGE = 0x80,
NVME_SC_CAP_EXCEEDED = 0x81,
NVME_SC_NS_NOT_READY = 0x82,
NVME_SC_RESERVATION_CONFLICT = 0x83,
+
+ /*
+ * Command Specific Status:
+ */
NVME_SC_CQ_INVALID = 0x100,
NVME_SC_QID_INVALID = 0x101,
NVME_SC_QUEUE_SIZE = 0x102,
@@ -572,9 +911,29 @@ enum {
NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e,
NVME_SC_FEATURE_NOT_PER_NS = 0x10f,
NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110,
+
+ /*
+ * I/O Command Set Specific - NVM commands:
+ */
NVME_SC_BAD_ATTRIBUTES = 0x180,
NVME_SC_INVALID_PI = 0x181,
NVME_SC_READ_ONLY = 0x182,
+
+ /*
+ * I/O Command Set Specific - Fabrics commands:
+ */
+ NVME_SC_CONNECT_FORMAT = 0x180,
+ NVME_SC_CONNECT_CTRL_BUSY = 0x181,
+ NVME_SC_CONNECT_INVALID_PARAM = 0x182,
+ NVME_SC_CONNECT_RESTART_DISC = 0x183,
+ NVME_SC_CONNECT_INVALID_HOST = 0x184,
+
+ NVME_SC_DISCOVERY_RESTART = 0x190,
+ NVME_SC_AUTH_REQUIRED = 0x191,
+
+ /*
+ * Media and Data Integrity Errors:
+ */
NVME_SC_WRITE_FAULT = 0x280,
NVME_SC_READ_ERROR = 0x281,
NVME_SC_GUARD_CHECK = 0x282,
@@ -582,12 +941,19 @@ enum {
NVME_SC_REFTAG_CHECK = 0x284,
NVME_SC_COMPARE_FAILED = 0x285,
NVME_SC_ACCESS_DENIED = 0x286,
+
NVME_SC_DNR = 0x4000,
};
struct nvme_completion {
- __le32 result; /* Used by admin commands to return data */
- __u32 rsvd;
+ /*
+ * Used by Admin and Fabrics commands to return data:
+ */
+ union {
+ __le16 result16;
+ __le32 result;
+ __le64 result64;
+ };
__le16 sq_head; /* how much of this queue may be reclaimed */
__le16 sq_id; /* submission queue that generated this entry */
__u16 command_id; /* of the command which completed */