From 0b5da8db145bfd44266ac964a2636a0cf8d7c286 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Tue, 30 Jun 2015 23:40:22 +0530 Subject: fuse: add support for SEEK_HOLE and SEEK_DATA in lseek A useful performance improvement for accessing virtual machine images via FUSE mount. See https://bugzilla.redhat.com/show_bug.cgi?id=1220173 for a use-case for glusterFS. Signed-off-by: Ravishankar N Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index c9aca042e61d..5974fae54e12 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -102,6 +102,9 @@ * - add ctime and ctimensec to fuse_setattr_in * - add FUSE_RENAME2 request * - add FUSE_NO_OPEN_SUPPORT flag + * + * 7.24 + * - add FUSE_LSEEK for SEEK_HOLE and SEEK_DATA support */ #ifndef _LINUX_FUSE_H @@ -137,7 +140,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 23 +#define FUSE_KERNEL_MINOR_VERSION 24 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -358,6 +361,7 @@ enum fuse_opcode { FUSE_FALLOCATE = 43, FUSE_READDIRPLUS = 44, FUSE_RENAME2 = 45, + FUSE_LSEEK = 46, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -758,4 +762,15 @@ struct fuse_notify_retrieve_in { /* Device ioctls: */ #define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t) +struct fuse_lseek_in { + uint64_t fh; + uint64_t offset; + uint32_t whence; + uint32_t padding; +}; + +struct fuse_lseek_out { + uint64_t offset; +}; + #endif /* _LINUX_FUSE_H */ -- cgit v1.2.3 From 257f871993474e2bde6c497b54022c362cf398e1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 4 Nov 2015 10:59:52 -0800 Subject: ovl: move super block magic number to magic.h The overlayfs file system is not recognized by programs like tail because the magic number is not in standard header location. Move it so that the value will propagate on for the GNU library and utilities. Needs to go in the fstatfs manual page as well. Signed-off-by: Stephen Hemminger Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 2 -- include/uapi/linux/magic.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 97cacb525974..32f31243d36a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -24,8 +24,6 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); -#define OVERLAYFS_SUPER_MAGIC 0x794c7630 - struct ovl_config { char *lowerdir; char *upperdir; diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 7b1425a6b370..eec438952aa7 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -31,6 +31,7 @@ #define PSTOREFS_MAGIC 0x6165676C #define EFIVARFS_MAGIC 0xde5e81e4 #define HOSTFS_SUPER_MAGIC 0x00c0ffee +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ -- cgit v1.2.3 From 89545d6d5cb0fab53211d6a36a8037c8020e1b7e Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Wed, 2 Dec 2015 23:17:47 +0100 Subject: include/uapi/linux/virtio_gpu.h: use __u8 from MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel headers exported to userspace are should these types. Fixes userspace compilation error: error: unknown type name ‘uint8_t’ Signed-off-by: Mikko Rapeli --- include/uapi/linux/virtio_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_gpu.h b/include/uapi/linux/virtio_gpu.h index 7a63faa9065c..4b04ead26cd9 100644 --- a/include/uapi/linux/virtio_gpu.h +++ b/include/uapi/linux/virtio_gpu.h @@ -287,7 +287,7 @@ struct virtio_gpu_get_capset { /* VIRTIO_GPU_RESP_OK_CAPSET */ struct virtio_gpu_resp_capset { struct virtio_gpu_ctrl_hdr hdr; - uint8_t capset_data[]; + __u8 capset_data[]; }; #define VIRTIO_GPU_EVENT_DISPLAY (1 << 0) -- cgit v1.2.3 From 35d0f1d54ecd813973091fdb17baae43ea2b60b9 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Sun, 1 Feb 2015 15:10:41 +0100 Subject: include/uapi/linux/agpgart.h: include stdlib.h in userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes userspace compiler error: error: unknown type name ‘size_t’ Signed-off-by: Mikko Rapeli --- include/uapi/linux/agpgart.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/agpgart.h b/include/uapi/linux/agpgart.h index 4e828cf487bc..f5251045181a 100644 --- a/include/uapi/linux/agpgart.h +++ b/include/uapi/linux/agpgart.h @@ -52,6 +52,7 @@ #ifndef __KERNEL__ #include +#include struct agp_version { __u16 major; -- cgit v1.2.3 From 5ca4c20cfd37bac6486de040e9951b3b34755238 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 5 Nov 2015 21:43:06 +0200 Subject: keys, trusted: select hash algorithm for TPM2 chips Added 'hash=' option for selecting the hash algorithm for add_key() syscall and documentation for it. Added entry for sm3-256 to the following tables in order to support TPM_ALG_SM3_256: * hash_algo_name * hash_digest_size Includes support for the following hash algorithms: * sha1 * sha256 * sha384 * sha512 * sm3-256 Signed-off-by: Jarkko Sakkinen Tested-by: Colin Ian King Reviewed-by: James Morris Reviewed-by: Mimi Zohar Acked-by: Peter Huewe --- Documentation/security/keys-trusted-encrypted.txt | 3 ++ crypto/hash_info.c | 2 ++ drivers/char/tpm/tpm.h | 10 +++++-- drivers/char/tpm/tpm2-cmd.c | 36 +++++++++++++++++++++-- include/crypto/hash_info.h | 3 ++ include/keys/trusted-type.h | 1 + include/uapi/linux/hash_info.h | 1 + security/keys/Kconfig | 1 + security/keys/trusted.c | 27 ++++++++++++++++- 9 files changed, 77 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/security/keys-trusted-encrypted.txt b/Documentation/security/keys-trusted-encrypted.txt index e105ae97a4f5..fd2565b301e8 100644 --- a/Documentation/security/keys-trusted-encrypted.txt +++ b/Documentation/security/keys-trusted-encrypted.txt @@ -38,6 +38,9 @@ Usage: pcrlock= pcr number to be extended to "lock" blob migratable= 0|1 indicating permission to reseal to new PCR values, default 1 (resealing allowed) + hash= hash algorithm name as a string. For TPM 1.x the only + allowed value is sha1. For TPM 2.x the allowed values + are sha1, sha256, sha384, sha512 and sm3-256. "keyctl print" returns an ascii hex copy of the sealed key, which is in standard TPM_STORED_DATA format. The key length for new keys are always in bytes. diff --git a/crypto/hash_info.c b/crypto/hash_info.c index 3e7ff46f26e8..7b1e0b188ce6 100644 --- a/crypto/hash_info.c +++ b/crypto/hash_info.c @@ -31,6 +31,7 @@ const char *const hash_algo_name[HASH_ALGO__LAST] = { [HASH_ALGO_TGR_128] = "tgr128", [HASH_ALGO_TGR_160] = "tgr160", [HASH_ALGO_TGR_192] = "tgr192", + [HASH_ALGO_SM3_256] = "sm3-256", }; EXPORT_SYMBOL_GPL(hash_algo_name); @@ -52,5 +53,6 @@ const int hash_digest_size[HASH_ALGO__LAST] = { [HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE, [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE, [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE, + [HASH_ALGO_SM3_256] = SM3256_DIGEST_SIZE, }; EXPORT_SYMBOL_GPL(hash_digest_size); diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 347fc615bcc9..542a80cbfd9c 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -83,16 +83,20 @@ enum tpm2_structures { }; enum tpm2_return_codes { - TPM2_RC_INITIALIZE = 0x0100, - TPM2_RC_TESTING = 0x090A, + TPM2_RC_HASH = 0x0083, /* RC_FMT1 */ + TPM2_RC_INITIALIZE = 0x0100, /* RC_VER1 */ TPM2_RC_DISABLED = 0x0120, + TPM2_RC_TESTING = 0x090A, /* RC_WARN */ }; enum tpm2_algorithms { TPM2_ALG_SHA1 = 0x0004, TPM2_ALG_KEYEDHASH = 0x0008, TPM2_ALG_SHA256 = 0x000B, - TPM2_ALG_NULL = 0x0010 + TPM2_ALG_SHA384 = 0x000C, + TPM2_ALG_SHA512 = 0x000D, + TPM2_ALG_NULL = 0x0010, + TPM2_ALG_SM3_256 = 0x0012, }; enum tpm2_command_codes { diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index c12130485fc1..d9d082206f6e 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -16,6 +16,7 @@ */ #include "tpm.h" +#include #include enum tpm2_object_attributes { @@ -104,6 +105,19 @@ struct tpm2_cmd { union tpm2_cmd_params params; } __packed; +struct tpm2_hash { + unsigned int crypto_id; + unsigned int tpm_id; +}; + +static struct tpm2_hash tpm2_hash_map[] = { + {HASH_ALGO_SHA1, TPM2_ALG_SHA1}, + {HASH_ALGO_SHA256, TPM2_ALG_SHA256}, + {HASH_ALGO_SHA384, TPM2_ALG_SHA384}, + {HASH_ALGO_SHA512, TPM2_ALG_SHA512}, + {HASH_ALGO_SM3_256, TPM2_ALG_SM3_256}, +}; + /* * Array with one entry per ordinal defining the maximum amount * of time the chip could take to return the result. The values @@ -429,8 +443,20 @@ int tpm2_seal_trusted(struct tpm_chip *chip, { unsigned int blob_len; struct tpm_buf buf; + u32 hash; + int i; int rc; + for (i = 0; i < ARRAY_SIZE(tpm2_hash_map); i++) { + if (options->hash == tpm2_hash_map[i].crypto_id) { + hash = tpm2_hash_map[i].tpm_id; + break; + } + } + + if (i == ARRAY_SIZE(tpm2_hash_map)) + return -EINVAL; + rc = tpm_buf_init(&buf, TPM2_ST_SESSIONS, TPM2_CC_CREATE); if (rc) return rc; @@ -455,7 +481,7 @@ int tpm2_seal_trusted(struct tpm_chip *chip, tpm_buf_append_u16(&buf, 14); tpm_buf_append_u16(&buf, TPM2_ALG_KEYEDHASH); - tpm_buf_append_u16(&buf, TPM2_ALG_SHA256); + tpm_buf_append_u16(&buf, hash); tpm_buf_append_u32(&buf, TPM2_ATTR_USER_WITH_AUTH); tpm_buf_append_u16(&buf, 0); /* policy digest size */ tpm_buf_append_u16(&buf, TPM2_ALG_NULL); @@ -488,8 +514,12 @@ int tpm2_seal_trusted(struct tpm_chip *chip, out: tpm_buf_destroy(&buf); - if (rc > 0) - rc = -EPERM; + if (rc > 0) { + if ((rc & TPM2_RC_HASH) == TPM2_RC_HASH) + rc = -EINVAL; + else + rc = -EPERM; + } return rc; } diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h index e1e5a3e5dd1b..56f217d41f12 100644 --- a/include/crypto/hash_info.h +++ b/include/crypto/hash_info.h @@ -34,6 +34,9 @@ #define TGR160_DIGEST_SIZE 20 #define TGR192_DIGEST_SIZE 24 +/* not defined in include/crypto/ */ +#define SM3256_DIGEST_SIZE 32 + extern const char *const hash_algo_name[HASH_ALGO__LAST]; extern const int hash_digest_size[HASH_ALGO__LAST]; diff --git a/include/keys/trusted-type.h b/include/keys/trusted-type.h index f91ecd9d1bb1..a6a100833ae9 100644 --- a/include/keys/trusted-type.h +++ b/include/keys/trusted-type.h @@ -36,6 +36,7 @@ struct trusted_key_options { uint32_t pcrinfo_len; unsigned char pcrinfo[MAX_PCRINFO_SIZE]; int pcrlock; + uint32_t hash; }; extern struct key_type key_type_trusted; diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h index ca18c45f8304..ebf8fd885dd5 100644 --- a/include/uapi/linux/hash_info.h +++ b/include/uapi/linux/hash_info.h @@ -31,6 +31,7 @@ enum hash_algo { HASH_ALGO_TGR_128, HASH_ALGO_TGR_160, HASH_ALGO_TGR_192, + HASH_ALGO_SM3_256, HASH_ALGO__LAST }; diff --git a/security/keys/Kconfig b/security/keys/Kconfig index 72483b8f1be5..fe4d74e126a7 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -54,6 +54,7 @@ config TRUSTED_KEYS select CRYPTO select CRYPTO_HMAC select CRYPTO_SHA1 + select CRYPTO_HASH_INFO help This option provides support for creating, sealing, and unsealing keys in the kernel. Trusted keys are random number symmetric keys, diff --git a/security/keys/trusted.c b/security/keys/trusted.c index 7c183c767a3a..8f1300cab38e 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -11,6 +11,7 @@ * See Documentation/security/keys-trusted-encrypted.txt */ +#include #include #include #include @@ -710,7 +711,8 @@ enum { Opt_err = -1, Opt_new, Opt_load, Opt_update, Opt_keyhandle, Opt_keyauth, Opt_blobauth, - Opt_pcrinfo, Opt_pcrlock, Opt_migratable + Opt_pcrinfo, Opt_pcrlock, Opt_migratable, + Opt_hash, }; static const match_table_t key_tokens = { @@ -723,6 +725,7 @@ static const match_table_t key_tokens = { {Opt_pcrinfo, "pcrinfo=%s"}, {Opt_pcrlock, "pcrlock=%s"}, {Opt_migratable, "migratable=%s"}, + {Opt_hash, "hash=%s"}, {Opt_err, NULL} }; @@ -737,6 +740,14 @@ static int getoptions(char *c, struct trusted_key_payload *pay, unsigned long handle; unsigned long lock; unsigned long token_mask = 0; + int i; + int tpm2; + + tpm2 = tpm_is_tpm2(TPM_ANY_NUM); + if (tpm2 < 0) + return tpm2; + + opt->hash = tpm2 ? HASH_ALGO_SHA256 : HASH_ALGO_SHA1; while ((p = strsep(&c, " \t"))) { if (*p == '\0' || *p == ' ' || *p == '\t') @@ -790,6 +801,20 @@ static int getoptions(char *c, struct trusted_key_payload *pay, return -EINVAL; opt->pcrlock = lock; break; + case Opt_hash: + for (i = 0; i < HASH_ALGO__LAST; i++) { + if (!strcmp(args[0].from, hash_algo_name[i])) { + opt->hash = i; + break; + } + } + if (i == HASH_ALGO__LAST) + return -EINVAL; + if (!tpm2 && i != HASH_ALGO_SHA1) { + pr_info("trusted_key: TPM 1.x only supports SHA-1.\n"); + return -EINVAL; + } + break; default: return -EINVAL; } -- cgit v1.2.3 From 334e580a6f97e2e84d1c19a8679603956acaa622 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Jan 2016 16:44:15 +1100 Subject: fs: XFS_IOC_FS[SG]SETXATTR to FS_IOC_FS[SG]ETXATTR promotion Hoist the ioctl definitions for the XFS_IOC_FS[SG]SETXATTR API from fs/xfs/libxfs/xfs_fs.h to include/uapi/linux/fs.h so that the ioctls can be used by all filesystems, not just XFS. This enables (initially) ext4 to use the ioctl to set project IDs on inodes. Based-on-patch-from: Li Xi Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_fs.h | 51 +++++++++++++++++-------------------------------- include/uapi/linux/fs.h | 32 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 33 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b2b73a998d42..dd29d0a670bd 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -36,38 +36,23 @@ struct dioattr { #endif /* - * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. + * Flags for the bs_xflags/fsx_xflags field in FS_IOC_FS[GS]ETXATTR[A] */ -#ifndef HAVE_FSXATTR -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#endif - -/* - * Flags for the bs_xflags/fsx_xflags field - * There should be a one-to-one correspondence between these flags and the - * XFS_DIFLAG_s. - */ -#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ +#define XFS_XFLAG_REALTIME FS_XFLAG_REALTIME +#define XFS_XFLAG_PREALLOC FS_XFLAG_PREALLOC +#define XFS_XFLAG_IMMUTABLE FS_XFLAG_IMMUTABLE +#define XFS_XFLAG_APPEND FS_XFLAG_APPEND +#define XFS_XFLAG_SYNC FS_XFLAG_SYNC +#define XFS_XFLAG_NOATIME FS_XFLAG_NOATIME +#define XFS_XFLAG_NODUMP FS_XFLAG_NODUMP +#define XFS_XFLAG_RTINHERIT FS_XFLAG_RTINHERIT +#define XFS_XFLAG_PROJINHERIT FS_XFLAG_PROJINHERIT +#define XFS_XFLAG_NOSYMLINKS FS_XFLAG_NOSYMLINKS +#define XFS_XFLAG_EXTSIZE FS_XFLAG_EXTSIZE +#define XFS_XFLAG_EXTSZINHERIT FS_XFLAG_EXTSZINHERIT +#define XFS_XFLAG_NODEFRAG FS_XFLAG_NODEFRAG +#define XFS_XFLAG_FILESTREAM FS_XFLAG_FILESTREAM +#define XFS_XFLAG_HASATTR FS_XFLAG_HASATTR /* * Structure for XFS_IOC_GETBMAP. @@ -514,8 +499,8 @@ typedef struct xfs_swapext #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) -#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) -#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f15d980249b5..df175dddd07f 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -110,6 +110,36 @@ struct inodes_stat_t { #define MS_MGC_VAL 0xC0ED0000 #define MS_MGC_MSK 0xffff0000 +/* + * Structure for FS_IOC_FSGETXATTR[A] and FS_IOC_FSSETXATTR. + */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. */ @@ -169,6 +199,8 @@ struct inodes_stat_t { #define FS_IOC32_SETFLAGS _IOW('f', 2, int) #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) +#define FS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) -- cgit v1.2.3 From 58f88ca2df7270881de2034c8286233a89efe71c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Jan 2016 16:44:15 +1100 Subject: xfs: introduce per-inode DAX enablement Rather than just being able to turn DAX on and off via a mount option, some applications may only want to enable DAX for certain performance critical files in a filesystem. This patch introduces a new inode flag to enable DAX in the v3 inode di_flags2 field. It adds support for setting and clearing flags in the di_flags2 field via the XFS_IOC_FSSETXATTR ioctl, and sets the S_DAX inode flag appropriately when it is seen. When this flag is set on a directory, it acts as an "inherit flag". That is, inodes created in the directory will automatically inherit the on-disk inode DAX flag, enabling administrators to set up directory heirarchies that automatically use DAX. Setting this flag on an empty root directory will make the entire filesystem use DAX by default. Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_format.h | 9 +++++++++ fs/xfs/xfs_inode.c | 32 +++++++++++++++++++++++--------- fs/xfs/xfs_ioctl.c | 18 +++++++++++++++++- fs/xfs/xfs_iops.c | 4 ++-- include/uapi/linux/fs.h | 1 + 5 files changed, 52 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index f28eeabb9454..b4ae7cef028a 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1023,6 +1023,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) +/* + * Values for di_flags2 These start by being exposed to userspace in the upper + * 16 bits of the XFS_XFLAG_s range. + */ +#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) + +#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) + /* * Inode number format: * low inopblog bits - offset in block diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ca9ca5a3c535..89299085450e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -610,7 +610,9 @@ __xfs_iflock( STATIC uint _xfs_dic2xflags( - __uint16_t di_flags) + __uint16_t di_flags, + uint64_t di_flags2, + bool has_attr) { uint flags = 0; @@ -645,25 +647,32 @@ _xfs_dic2xflags( flags |= FS_XFLAG_FILESTREAM; } + if (di_flags2 & XFS_DIFLAG2_ANY) { + if (di_flags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + } + + if (has_attr) + flags |= FS_XFLAG_HASATTR; + return flags; } uint xfs_ip2xflags( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_icdinode_t *dic = &ip->i_d; + struct xfs_icdinode *dic = &ip->i_d; - return _xfs_dic2xflags(dic->di_flags) | - (XFS_IFORK_Q(ip) ? FS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } uint xfs_dic2xflags( - xfs_dinode_t *dip) + struct xfs_dinode *dip) { - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | - (XFS_DFORK_Q(dip) ? FS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), + be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); } /* @@ -862,7 +871,8 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; + uint64_t di_flags2 = 0; + uint di_flags = 0; if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) @@ -898,7 +908,11 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + ip->i_d.di_flags |= di_flags; + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 94b35eb32cb6..478d04e07f95 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -945,6 +945,7 @@ xfs_set_diflags( unsigned int xflags) { unsigned int di_flags; + uint64_t di_flags2; /* can't set PREALLOC this way, just preserve it */ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); @@ -977,8 +978,18 @@ xfs_set_diflags( if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; + + /* diflags2 only valid for v3 inodes. */ + if (ip->i_d.di_version < 3) + return; + + di_flags2 = 0; + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 = di_flags2; + } STATIC void @@ -1004,6 +1015,11 @@ xfs_diflags_to_linux( inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; + if (xflags & FS_XFLAG_DAX) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; + } static int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 245268a0cdf0..a1b8af158d05 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1201,8 +1201,8 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - /* XXX: Also needs an on-disk per inode flag! */ - if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + if (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) inode->i_flags |= S_DAX; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index df175dddd07f..4cad4c8ab6b2 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -138,6 +138,7 @@ struct fsxattr { #define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ #define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ #define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ #define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* the read-only stuff doesn't really belong here, but any other place is -- cgit v1.2.3 From 68ce7bfcd995a8a393b1b14fa67dbc16fa3dc784 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 8 Jan 2016 16:01:25 -0500 Subject: fs: clean up the flags definition in uapi/linux/fs.h Add an explanation for the flags used by FS_IOC_[GS]ETFLAGS and remind people that changes should be revised by linux-fsdevel and linux-api. Add flags that are used on-disk for ext4, and remove FS_DIRECTIO_FL since it was used only by gfs2 and support was removed in 2008 in commit c9f6a6bbc28 ("The ability to mark files for direct i/o access when opened normally is both unused and pointless, so this patch removes support for that feature.") Now we have _two_ remaining flags left. But since we want to discourage people from assigning new flags, that's OK. Signed-off-by: Theodore Ts'o --- include/uapi/linux/fs.h | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f15d980249b5..c5083d2e2c02 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -2,8 +2,11 @@ #define _UAPI_LINUX_FS_H /* - * This file has definitions for some important file table - * structures etc. + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. */ #include @@ -172,6 +175,23 @@ struct inodes_stat_t { /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) + * + * Note: for historical reasons, these flags were originally used and + * defined for use by ext2/ext3, and then other file systems started + * using these flags so they wouldn't need to write their own version + * of chattr/lsattr (which was shipped as part of e2fsprogs). You + * should think twice before trying to use these flags in new + * contexts, or trying to assign these flags, since they are used both + * as the UAPI and the on-disk encoding for ext2/3/4. Also, we are + * almost out of 32-bit flags. :-) + * + * We have recently hoisted FS_IOC_FSGETXATTR / FS_IOC_FSSETXATTR from + * XFS to the generic FS level interface. This uses a structure that + * has padding and hence has more room to grow, so it may be more + * appropriate for many new use cases. + * + * Please do not change these flags or interfaces before checking with + * linux-fsdevel@vger.kernel.org and linux-api@vger.kernel.org. */ #define FS_SECRM_FL 0x00000001 /* Secure deletion */ #define FS_UNRM_FL 0x00000002 /* Undelete */ @@ -185,8 +205,8 @@ struct inodes_stat_t { #define FS_DIRTY_FL 0x00000100 #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ -#define FS_ECOMPR_FL 0x00000800 /* Compression error */ /* End compression flags --- maybe not all used */ +#define FS_ENCRYPT_FL 0x00000800 /* Encrypted file */ #define FS_BTREE_FL 0x00001000 /* btree format dir */ #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ @@ -194,9 +214,12 @@ struct inodes_stat_t { #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define FS_HUGE_FILE_FL 0x00040000 /* Reserved for ext4 */ #define FS_EXTENT_FL 0x00080000 /* Extents */ -#define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ +#define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ #define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ -- cgit v1.2.3 From e3eb3799f7e0d0924ceeba672ab271865de2802d Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 12 Jan 2016 07:49:36 +0100 Subject: lightnvm: core on-disk initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An Open-Channel SSD shall be initialized before use. To initialize, we define an on-disk format, that keeps a small set of metadata to bring up the media manager on top of the device. The initial step is introduced to allow a user to format the disks for a given media manager. During format, a system block is stored on one to three separate luns on the device. Each lun has the system block duplicated. During initialization, the system block can be retrieved and the appropriate media manager can initialized. The on-disk format currently covers (struct nvm_system_block): - Magic value "NVMS". - Monotonic increasing sequence number. - The physical block erase count. - Version of the system block format. - Media manager type. - Media manager superblock physical address. The interface provides three functions to manage the system block: int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *) int nvm_get_sysblock(struct nvm *dev, struct nvm_sb_info *) int nvm_update_sysblock(struct nvm *dev, struct nvm_sb_info *) Each implement a part of the logic to manage the system block. The initialization creates the first system blocks and mark them on the device. Get retrieves the latest system block by scanning all pages in the associated system blocks. The update sysblock writes new metadata and allocates new block if necessary. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Makefile | 2 +- drivers/lightnvm/core.c | 1 + drivers/lightnvm/sysblk.c | 562 ++++++++++++++++++++++++++++++++++++++++++ include/linux/lightnvm.h | 35 +++ include/uapi/linux/lightnvm.h | 1 + 5 files changed, 600 insertions(+), 1 deletion(-) create mode 100644 drivers/lightnvm/sysblk.c (limited to 'include/uapi/linux') diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index 7e0f42acb737..a7a0a22cf1a5 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,6 @@ # Makefile for Open-Channel SSDs. # -obj-$(CONFIG_NVM) := core.o +obj-$(CONFIG_NVM) := core.o sysblk.o obj-$(CONFIG_NVM_GENNVM) += gennvm.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 1f302cc73e0b..73b8ae1dafc4 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -466,6 +466,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->nr_chnls; dev->total_pages = dev->total_blocks * dev->pgs_per_blk; INIT_LIST_HEAD(&dev->online_targets); + mutex_init(&dev->mlock); return 0; } diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c new file mode 100644 index 000000000000..b8489f425b05 --- /dev/null +++ b/drivers/lightnvm/sysblk.c @@ -0,0 +1,562 @@ +/* + * Copyright (C) 2015 Matias Bjorling. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include + +#define MAX_SYSBLKS 3 /* remember to update mapping scheme on change */ +#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases + * enables ~1.5M updates per sysblk unit + */ + +struct sysblk_scan { + /* A row is a collection of flash blocks for a system block. */ + int nr_rows; + int row; + int act_blk[MAX_SYSBLKS]; + + int nr_ppas; + struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */ +}; + +static inline int scan_ppa_idx(int row, int blkid) +{ + return (row * MAX_BLKS_PR_SYSBLK) + blkid; +} + +void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) +{ + info->seqnr = be32_to_cpu(sb->seqnr); + info->erase_cnt = be32_to_cpu(sb->erase_cnt); + info->version = be16_to_cpu(sb->version); + strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN); + info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); +} + +void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info) +{ + sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); + sb->seqnr = cpu_to_be32(info->seqnr); + sb->erase_cnt = cpu_to_be32(info->erase_cnt); + sb->version = cpu_to_be16(info->version); + strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN); + sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa); +} + +static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas) +{ + int nr_rows = min_t(int, MAX_SYSBLKS, dev->nr_chnls); + int i; + + for (i = 0; i < nr_rows; i++) + sysblk_ppas[i].ppa = 0; + + /* if possible, place sysblk at first channel, middle channel and last + * channel of the device. If not, create only one or two sys blocks + */ + switch (dev->nr_chnls) { + case 2: + sysblk_ppas[1].g.ch = 1; + /* fall-through */ + case 1: + sysblk_ppas[0].g.ch = 0; + break; + default: + sysblk_ppas[0].g.ch = 0; + sysblk_ppas[1].g.ch = dev->nr_chnls / 2; + sysblk_ppas[2].g.ch = dev->nr_chnls - 1; + break; + } + + return nr_rows; +} + +void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, + struct ppa_addr *sysblk_ppas) +{ + memset(s, 0, sizeof(struct sysblk_scan)); + s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas); +} + +static int sysblk_get_host_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct sysblk_scan *s = private; + int i, nr_sysblk = 0; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] != NVM_BLK_T_HOST) + continue; + + if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) { + pr_err("nvm: too many host blks\n"); + return -EINVAL; + } + + ppa.g.blk = i; + + s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa; + s->nr_ppas++; + nr_sysblk++; + } + + return 0; +} + +static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, + struct ppa_addr *ppas, nvm_bb_update_fn *fn) +{ + struct ppa_addr dppa; + int i, ret; + + s->nr_ppas = 0; + + for (i = 0; i < s->nr_rows; i++) { + dppa = generic_to_dev_addr(dev, ppas[i]); + s->row = i; + + ret = dev->ops->get_bb_tbl(dev, dppa, dev->blks_per_lun, fn, s); + if (ret) { + pr_err("nvm: failed bb tbl for ppa (%u %u)\n", + ppas[i].g.ch, + ppas[i].g.blk); + return ret; + } + } + + return ret; +} + +/* + * scans a block for latest sysblk. + * Returns: + * 0 - newer sysblk not found. PPA is updated to latest page. + * 1 - newer sysblk found and stored in *cur. PPA is updated to + * next valid page. + * <0- error. + */ +static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa, + struct nvm_system_block *sblk) +{ + struct nvm_system_block *cur; + int pg, cursz, ret, found = 0; + + /* the full buffer for a flash page is allocated. Only the first of it + * contains the system block information + */ + cursz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; + cur = kmalloc(cursz, GFP_KERNEL); + if (!cur) + return -ENOMEM; + + /* perform linear scan through the block */ + for (pg = 0; pg < dev->lps_per_blk; pg++) { + ppa->g.pg = ppa_to_slc(dev, pg); + + ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE, + cur, cursz); + if (ret) { + if (ret == NVM_RSP_ERR_EMPTYPAGE) { + pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n", + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; + } + pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)", + ret, + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; /* if we can't read a page, continue to the + * next blk + */ + } + + if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) { + pr_debug("nvm: scan break for ppa (%u %u %u %u)\n", + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; /* last valid page already found */ + } + + if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr)) + continue; + + memcpy(sblk, cur, sizeof(struct nvm_system_block)); + found = 1; + } + + kfree(cur); + + return found; +} + +static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type) +{ + struct nvm_rq rqd; + int ret; + + if (s->nr_ppas > dev->ops->max_phys_sect) { + pr_err("nvm: unable to update all sysblocks atomically\n"); + return -EINVAL; + } + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas); + nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->set_bb_tbl(dev, &rqd, type); + nvm_free_rqd_ppalist(dev, &rqd); + if (ret) { + pr_err("nvm: sysblk failed bb mark\n"); + return -EINVAL; + } + + return 0; +} + +static int sysblk_get_free_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct sysblk_scan *s = private; + struct ppa_addr *sppa; + int i, blkid = 0; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] == NVM_BLK_T_HOST) + return -EEXIST; + + if (blks[i] != NVM_BLK_T_FREE) + continue; + + sppa = &s->ppas[scan_ppa_idx(s->row, blkid)]; + sppa->g.ch = ppa.g.ch; + sppa->g.lun = ppa.g.lun; + sppa->g.blk = i; + s->nr_ppas++; + blkid++; + + pr_debug("nvm: use (%u %u %u) as sysblk\n", + sppa->g.ch, sppa->g.lun, sppa->g.blk); + if (blkid > MAX_BLKS_PR_SYSBLK - 1) + return 0; + } + + pr_err("nvm: sysblk failed get sysblk\n"); + return -EINVAL; +} + +static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, + struct sysblk_scan *s) +{ + struct nvm_system_block nvmsb; + void *buf; + int i, sect, ret, bufsz; + struct ppa_addr *ppas; + + nvm_cpu_to_sysblk(&nvmsb, info); + + /* buffer for flash page */ + bufsz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; + buf = kzalloc(bufsz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, &nvmsb, sizeof(struct nvm_system_block)); + + ppas = kcalloc(dev->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL); + if (!ppas) { + ret = -ENOMEM; + goto err; + } + + /* Write and verify */ + for (i = 0; i < s->nr_rows; i++) { + ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])]; + + pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk, + ppas[0].g.pg); + + /* Expand to all sectors within a flash page */ + if (dev->sec_per_pg > 1) { + for (sect = 1; sect < dev->sec_per_pg; sect++) { + ppas[sect].ppa = ppas[0].ppa; + ppas[sect].g.sec = sect; + } + } + + ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PWRITE, + NVM_IO_SLC_MODE, buf, bufsz); + if (ret) { + pr_err("nvm: sysblk failed program (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + break; + } + + ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PREAD, + NVM_IO_SLC_MODE, buf, bufsz); + if (ret) { + pr_err("nvm: sysblk failed read (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + break; + } + + if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) { + pr_err("nvm: sysblk failed verify (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + ret = -EINVAL; + break; + } + } + + kfree(ppas); +err: + kfree(buf); + + return ret; +} + +static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s) +{ + int i, ret; + unsigned long nxt_blk; + struct ppa_addr *ppa; + + for (i = 0; i < s->nr_rows; i++) { + nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK; + ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)]; + ppa->g.pg = ppa_to_slc(dev, 0); + + ret = nvm_erase_ppa(dev, ppa, 1); + if (ret) + return ret; + + s->act_blk[i] = nxt_blk; + } + + return 0; +} + +int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) +{ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + struct nvm_system_block *cur; + int i, j, found = 0; + int ret = -ENOMEM; + + /* + * 1. setup sysblk locations + * 2. get bad block list + * 3. filter on host-specific (type 3) + * 4. iterate through all and find the highest seq nr. + * 5. return superblock information + */ + + if (!dev->ops->get_bb_tbl) + return -EINVAL; + + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + if (ret) + goto err_sysblk; + + /* no sysblocks initialized */ + if (!s.nr_ppas) + goto err_sysblk; + + cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); + if (!cur) + goto err_sysblk; + + /* find the latest block across all sysblocks */ + for (i = 0; i < s.nr_rows; i++) { + for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { + struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)]; + + ret = nvm_scan_block(dev, &ppa, cur); + if (ret > 0) + found = 1; + else if (ret < 0) + break; + } + } + + nvm_sysblk_to_cpu(info, cur); + + kfree(cur); +err_sysblk: + mutex_unlock(&dev->mlock); + + if (found) + return 1; + return ret; +} + +int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new) +{ + /* 1. for each latest superblock + * 2. if room + * a. write new flash page entry with the updated information + * 3. if no room + * a. find next available block on lun (linear search) + * if none, continue to next lun + * if none at all, report error. also report that it wasn't + * possible to write to all superblocks. + * c. write data to block. + */ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + struct nvm_system_block *cur; + int i, j, ppaidx, found = 0; + int ret = -ENOMEM; + + if (!dev->ops->get_bb_tbl) + return -EINVAL; + + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + if (ret) + goto err_sysblk; + + cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); + if (!cur) + goto err_sysblk; + + /* Get the latest sysblk for each sysblk row */ + for (i = 0; i < s.nr_rows; i++) { + found = 0; + for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { + ppaidx = scan_ppa_idx(i, j); + ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur); + if (ret > 0) { + s.act_blk[i] = j; + found = 1; + } else if (ret < 0) + break; + } + } + + if (!found) { + pr_err("nvm: no valid sysblks found to update\n"); + ret = -EINVAL; + goto err_cur; + } + + /* + * All sysblocks found. Check that they have same page id in their flash + * blocks + */ + for (i = 1; i < s.nr_rows; i++) { + struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])]; + struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])]; + + if (l.g.pg != r.g.pg) { + pr_err("nvm: sysblks not on same page. Previous update failed.\n"); + ret = -EINVAL; + goto err_cur; + } + } + + /* + * Check that there haven't been another update to the seqnr since we + * began + */ + if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) { + pr_err("nvm: seq is not sequential\n"); + ret = -EINVAL; + goto err_cur; + } + + /* + * When all pages in a block has been written, a new block is selected + * and writing is performed on the new block. + */ + if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg == + dev->lps_per_blk - 1) { + ret = nvm_prepare_new_sysblks(dev, &s); + if (ret) + goto err_cur; + } + + ret = nvm_write_and_verify(dev, new, &s); +err_cur: + kfree(cur); +err_sysblk: + mutex_unlock(&dev->mlock); + + return ret; +} + +int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) +{ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + int ret; + + /* + * 1. select master blocks and select first available blks + * 2. get bad block list + * 3. mark MAX_SYSBLKS block as host-based device allocated. + * 4. write and verify data to block + */ + + if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl) + return -EINVAL; + + if (!(dev->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) { + pr_err("nvm: memory does not support SLC access\n"); + return -EINVAL; + } + + /* Index all sysblocks and mark them as host-driven */ + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_free_blks); + if (ret) + goto err_mark; + + ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_HOST); + if (ret) + goto err_mark; + + /* Write to the first block of each row */ + ret = nvm_write_and_verify(dev, info, &s); +err_mark: + mutex_unlock(&dev->mlock); + return ret; +} diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 678fd91a1f99..7ad22d3f0d2e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -17,6 +17,7 @@ enum { #include #include #include +#include enum { /* HW Responsibilities */ @@ -281,6 +282,15 @@ struct nvm_block { int state; }; +/* system block cpu representation */ +struct nvm_sb_info { + unsigned long seqnr; + unsigned long erase_cnt; + unsigned int version; + char mmtype[NVM_MMTYPE_LEN]; + struct ppa_addr fs_ppa; +}; + struct nvm_dev { struct nvm_dev_ops *ops; @@ -329,6 +339,8 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; char name[DISK_NAME_LEN]; + + struct mutex mlock; }; static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, @@ -394,6 +406,11 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, return ppa; } +static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) +{ + return dev->lptbl[slc_pg]; +} + typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); @@ -489,6 +506,24 @@ extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); + +/* sysblk.c */ +#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ + +/* system block on disk representation */ +struct nvm_system_block { + __be32 magic; /* magic signature */ + __be32 seqnr; /* sequence number */ + __be32 erase_cnt; /* erase count */ + __be16 version; /* version number */ + u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ + __be64 fs_ppa; /* PPA for media manager + * superblock */ +}; + +extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); #else /* CONFIG_NVM */ struct nvm_dev_ops; diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 928f98997d8a..0171b85e7efc 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -33,6 +33,7 @@ #define NVM_TTYPE_NAME_MAX 48 #define NVM_TTYPE_MAX 63 +#define NVM_MMTYPE_LEN 8 #define NVM_CTRL_FILE "/dev/lightnvm/control" -- cgit v1.2.3 From 5569615424613aa006005f18b03a3a12738a47d7 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 12 Jan 2016 07:49:37 +0100 Subject: lightnvm: introduce ioctl to initialize device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on the previous patch, we now introduce an ioctl to initialize the device using nvm_init_sysblock and create the necessary system blocks. The user may specify the media manager that they wish to instantiate on top. Default from user-space will be "gennvm". Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 50 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/lightnvm.h | 11 ++++++++++ 2 files changed, 61 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 73b8ae1dafc4..ee08fac6f327 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1019,6 +1019,54 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) return __nvm_configure_remove(&remove); } +static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) +{ + info->seqnr = 1; + info->erase_cnt = 0; + info->version = 1; +} + +static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init) +{ + struct nvm_dev *dev; + struct nvm_sb_info info; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(init->dev); + up_write(&nvm_lock); + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + nvm_setup_nvm_sb_info(&info); + + strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN); + info.fs_ppa.ppa = -1; + + return nvm_init_sysblock(dev, &info); +} + +static long nvm_ioctl_dev_init(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_init init; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) + return -EFAULT; + + if (init.flags != 0) { + pr_err("nvm: no flags supported\n"); + return -EINVAL; + } + + init.dev[DISK_NAME_LEN - 1] = '\0'; + + return __nvm_ioctl_dev_init(&init); +} + static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -1032,6 +1080,8 @@ static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) return nvm_ioctl_dev_create(file, argp); case NVM_DEV_REMOVE: return nvm_ioctl_dev_remove(file, argp); + case NVM_DEV_INIT: + return nvm_ioctl_dev_init(file, argp); } return 0; } diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 0171b85e7efc..56339e2e0a46 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -101,6 +101,12 @@ struct nvm_ioctl_remove { __u32 flags; }; +struct nvm_ioctl_dev_init { + char dev[DISK_NAME_LEN]; /* open-channel SSD device */ + char mmtype[NVM_MMTYPE_LEN]; /* register to media manager */ + + __u32 flags; +}; /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { @@ -111,6 +117,9 @@ enum { /* device level cmds */ NVM_DEV_CREATE_CMD, NVM_DEV_REMOVE_CMD, + + /* Init a device to support LightNVM media managers */ + NVM_DEV_INIT_CMD, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -123,6 +132,8 @@ enum { struct nvm_ioctl_create) #define NVM_DEV_REMOVE _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \ struct nvm_ioctl_remove) +#define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \ + struct nvm_ioctl_dev_init) #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 -- cgit v1.2.3 From 8b4970c41f88ad772771f87b1c82c395248a84d8 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 12 Jan 2016 07:49:39 +0100 Subject: lightnvm: introduce factory reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that a device can be managed using the system blocks, a method to reset the device is necessary as well. This patch introduces logic to reset the device easily to factory state and exposes it through an ioctl. The ioctl takes the following flags: NVM_FACTORY_ERASE_ONLY_USER By default all blocks, except host-reserved blocks are erased upon factory reset. Instead of this, only erase host-reserved blocks. NVM_FACTORY_RESET_HOST_BLKS Mark host-reserved blocks to be erased and set their type to free. NVM_FACTORY_RESET_GRWN_BBLKS Mark "grown bad blocks" to be erased and set their type to free. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 34 ++++++++ drivers/lightnvm/sysblk.c | 179 ++++++++++++++++++++++++++++++++++++++++++ include/linux/lightnvm.h | 2 + include/uapi/linux/lightnvm.h | 19 +++++ 4 files changed, 234 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 9e5712dda062..33224cb91c5b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1088,6 +1088,38 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg) return __nvm_ioctl_dev_init(&init); } +static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_factory fact; + struct nvm_dev *dev; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) + return -EFAULT; + + fact.dev[DISK_NAME_LEN - 1] = '\0'; + + if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) + return -EINVAL; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(fact.dev); + up_write(&nvm_lock); + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + if (dev->mt) { + dev->mt->unregister_mgr(dev); + dev->mt = NULL; + } + + return nvm_dev_factory(dev, fact.flags); +} + static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -1103,6 +1135,8 @@ static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) return nvm_ioctl_dev_remove(file, argp); case NVM_DEV_INIT: return nvm_ioctl_dev_init(file, argp); + case NVM_DEV_FACTORY: + return nvm_ioctl_dev_factory(file, argp); } return 0; } diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index b8489f425b05..321de1f154c5 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -560,3 +560,182 @@ err_mark: mutex_unlock(&dev->mlock); return ret; } + +struct factory_blks { + struct nvm_dev *dev; + int flags; + unsigned long *blks; +}; + +static int factory_nblks(int nblks) +{ + /* Round up to nearest BITS_PER_LONG */ + return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); +} + +static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun) +{ + int nblks = factory_nblks(dev->blks_per_lun); + + return ((ch * dev->luns_per_chnl * nblks) + (lun * nblks)) / + BITS_PER_LONG; +} + +static int nvm_factory_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct factory_blks *f = private; + struct nvm_dev *dev = f->dev; + int i, lunoff; + + lunoff = factory_blk_offset(dev, ppa.g.ch, ppa.g.lun); + + /* non-set bits correspond to the block must be erased */ + for (i = 0; i < nr_blks; i++) { + switch (blks[i]) { + case NVM_BLK_T_FREE: + if (f->flags & NVM_FACTORY_ERASE_ONLY_USER) + set_bit(i, &f->blks[lunoff]); + break; + case NVM_BLK_T_HOST: + if (!(f->flags & NVM_FACTORY_RESET_HOST_BLKS)) + set_bit(i, &f->blks[lunoff]); + break; + case NVM_BLK_T_GRWN_BAD: + if (!(f->flags & NVM_FACTORY_RESET_GRWN_BBLKS)) + set_bit(i, &f->blks[lunoff]); + break; + default: + set_bit(i, &f->blks[lunoff]); + break; + } + } + + return 0; +} + +static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, + int max_ppas, struct factory_blks *f) +{ + struct ppa_addr ppa; + int ch, lun, blkid, idx, done = 0, ppa_cnt = 0; + unsigned long *offset; + + while (!done) { + done = 1; + for (ch = 0; ch < dev->nr_chnls; ch++) { + for (lun = 0; lun < dev->luns_per_chnl; lun++) { + idx = factory_blk_offset(dev, ch, lun); + offset = &f->blks[idx]; + + blkid = find_first_zero_bit(offset, + dev->blks_per_lun); + if (blkid >= dev->blks_per_lun) + continue; + set_bit(blkid, offset); + + ppa.ppa = 0; + ppa.g.ch = ch; + ppa.g.lun = lun; + ppa.g.blk = blkid; + pr_debug("nvm: erase ppa (%u %u %u)\n", + ppa.g.ch, + ppa.g.lun, + ppa.g.blk); + + erase_list[ppa_cnt] = ppa; + ppa_cnt++; + done = 0; + + if (ppa_cnt == max_ppas) + return ppa_cnt; + } + } + } + + return ppa_cnt; +} + +static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, + nvm_bb_update_fn *fn, void *priv) +{ + struct ppa_addr dev_ppa; + int ret; + + dev_ppa = generic_to_dev_addr(dev, ppa); + + ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun, fn, priv); + if (ret) + pr_err("nvm: failed bb tbl for ch%u lun%u\n", + ppa.g.ch, ppa.g.blk); + return ret; +} + +static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f) +{ + int ch, lun, ret; + struct ppa_addr ppa; + + ppa.ppa = 0; + for (ch = 0; ch < dev->nr_chnls; ch++) { + for (lun = 0; lun < dev->luns_per_chnl; lun++) { + ppa.g.ch = ch; + ppa.g.lun = lun; + + ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, + f); + if (ret) + return ret; + } + } + + return 0; +} + +int nvm_dev_factory(struct nvm_dev *dev, int flags) +{ + struct factory_blks f; + struct ppa_addr *ppas; + int ppa_cnt, ret = -ENOMEM; + int max_ppas = dev->ops->max_phys_sect / dev->nr_planes; + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + + f.blks = kzalloc(factory_nblks(dev->blks_per_lun) * dev->nr_luns, + GFP_KERNEL); + if (!f.blks) + return ret; + + ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL); + if (!ppas) + goto err_blks; + + f.dev = dev; + f.flags = flags; + + /* create list of blks to be erased */ + ret = nvm_fact_select_blks(dev, &f); + if (ret) + goto err_ppas; + + /* continue to erase until list of blks until empty */ + while ((ppa_cnt = nvm_fact_get_blks(dev, ppas, max_ppas, &f)) > 0) + nvm_erase_ppa(dev, ppas, ppa_cnt); + + /* mark host reserved blocks free */ + if (flags & NVM_FACTORY_RESET_HOST_BLKS) { + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, + sysblk_get_host_blks); + if (!ret) + ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_FREE); + mutex_unlock(&dev->mlock); + } +err_ppas: + kfree(ppas); +err_blks: + kfree(f.blks); + return ret; +} +EXPORT_SYMBOL(nvm_dev_factory); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 02f36bdf216e..fc0e7c924967 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -527,6 +527,8 @@ struct nvm_system_block { extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); + +extern int nvm_dev_factory(struct nvm_dev *, int flags); #else /* CONFIG_NVM */ struct nvm_dev_ops; diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 56339e2e0a46..774a43128a7a 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -108,6 +108,20 @@ struct nvm_ioctl_dev_init { __u32 flags; }; +enum { + NVM_FACTORY_ERASE_ONLY_USER = 1 << 0, /* erase only blocks used as + * host blks or grown blks */ + NVM_FACTORY_RESET_HOST_BLKS = 1 << 1, /* remove host blk marks */ + NVM_FACTORY_RESET_GRWN_BBLKS = 1 << 2, /* remove grown blk marks */ + NVM_FACTORY_NR_BITS = 1 << 3, /* stops here */ +}; + +struct nvm_ioctl_dev_factory { + char dev[DISK_NAME_LEN]; + + __u32 flags; +}; + /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { /* top level cmds */ @@ -120,6 +134,9 @@ enum { /* Init a device to support LightNVM media managers */ NVM_DEV_INIT_CMD, + + /* Factory reset device */ + NVM_DEV_FACTORY_CMD, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -134,6 +151,8 @@ enum { struct nvm_ioctl_remove) #define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \ struct nvm_ioctl_dev_init) +#define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ + struct nvm_ioctl_dev_factory) #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 -- cgit v1.2.3 From a9cf8284b45110a4d98aea180a89c857e53bf850 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sun, 10 Jan 2016 20:14:11 -0500 Subject: uapi: update install list after nvme.h rename Commit 9d99a8dda154 ("nvme: move hardware structures out of the uapi version of nvme.h") renamed nvme.h to nvme_ioctl.h, but the uapi list still refers to nvme.h. People trying to install the headers hit a failure as the header no longer exists. Cc: stable@vger.kernel.org Signed-off-by: Mike Frysinger Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/Kbuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 628e6e64c2fb..88e1292cb786 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -306,7 +306,7 @@ header-y += nfs_mount.h header-y += nl80211.h header-y += n_r3964.h header-y += nubus.h -header-y += nvme.h +header-y += nvme_ioctl.h header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h -- cgit v1.2.3 From df0108c5da561c66c333bb46bfe3c1fc65905898 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 20 Jan 2016 14:59:24 -0800 Subject: epoll: add EPOLLEXCLUSIVE flag Currently, epoll file descriptors or epfds (the fd returned from epoll_create[1]()) that are added to a shared wakeup source are always added in a non-exclusive manner. This means that when we have multiple epfds attached to a shared fd source they are all woken up. This creates thundering herd type behavior. Introduce a new 'EPOLLEXCLUSIVE' flag that can be passed as part of the 'event' argument during an epoll_ctl() EPOLL_CTL_ADD operation. This new flag allows for exclusive wakeups when there are multiple epfds attached to a shared fd event source. The implementation walks the list of exclusive waiters, and queues an event to each epfd, until it finds the first waiter that has threads blocked on it via epoll_wait(). The idea is to search for threads which are idle and ready to process the wakeup events. Thus, we queue an event to at least 1 epfd, but may still potentially queue an event to all epfds that are attached to the shared fd source. Performance testing was done by Madars Vitolins using a modified version of Enduro/X. The use of the 'EPOLLEXCLUSIVE' flag reduce the length of this particular workload from 860s down to 24s. Sample epoll_clt text: EPOLLEXCLUSIVE Sets an exclusive wakeup mode for the epfd file descriptor that is being attached to the target file descriptor, fd. Thus, when an event occurs and multiple epfd file descriptors are attached to the same target file using EPOLLEXCLUSIVE, one or more epfds will receive an event with epoll_wait(2). The default in this scenario (when EPOLLEXCLUSIVE is not set) is for all epfds to receive an event. EPOLLEXCLUSIVE may only be specified with the op EPOLL_CTL_ADD. Signed-off-by: Jason Baron Tested-by: Madars Vitolins Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Al Viro Cc: Michael Kerrisk Cc: Eric Wong Cc: Jonathan Corbet Cc: Andy Lutomirski Cc: Hagen Paul Pfeifer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 24 +++++++++++++++++++++--- include/uapi/linux/eventpoll.h | 3 +++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e009cad8d5c..ae1dbcf47e97 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -92,7 +92,7 @@ */ /* Epoll private bits inside the event mask */ -#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -1002,6 +1002,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + int ewake = 0; if ((unsigned long)key & POLLFREE) { ep_pwq_from_wait(wait)->whead = NULL; @@ -1066,8 +1067,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ - if (waitqueue_active(&ep->wq)) + if (waitqueue_active(&ep->wq)) { + ewake = 1; wake_up_locked(&ep->wq); + } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1078,6 +1081,9 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); + if (epi->event.events & EPOLLEXCLUSIVE) + return ewake; + return 1; } @@ -1095,7 +1101,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; - add_wait_queue(whead, &pwq->wait); + if (epi->event.events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(whead, &pwq->wait); + else + add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; } else { @@ -1861,6 +1870,15 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; + /* + * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, + * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. + * Also, we do not currently supported nested exclusive wakeups. + */ + if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD || + (op == EPOLL_CTL_ADD && is_file_epoll(tf.file)))) + goto error_tgt_fput; + /* * At this point it is safe to assume that the "private_data" contains * our own data structure. diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index bc81fb2e1f0e..1c3154913a39 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -26,6 +26,9 @@ #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +/* Set exclusive wakeup mode for the target file descriptor */ +#define EPOLLEXCLUSIVE (1 << 28) + /* * Request the handling of system wakeup events so as to prevent system suspends * from happening while those events are being processed. -- cgit v1.2.3 From 9f4736fe7ca804aa79b5916221bb13dfc6221a0f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:13:39 -0800 Subject: block: revert runtime dax control of the raw block device Dynamically enabling DAX requires that the page cache first be flushed and invalidated. This must occur atomically with the change of DAX mode otherwise we confuse the fsync/msync tracking and violate data durability guarantees. Eliminate the possibilty of DAX-disabled to DAX-enabled transitions for now and revisit this for the next cycle. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox Cc: Andrew Morton Cc: Ross Zwisler Signed-off-by: Dan Williams --- block/ioctl.c | 38 -------------------------------------- fs/block_dev.c | 28 ---------------------------- include/linux/fs.h | 3 --- include/uapi/linux/fs.h | 1 - 4 files changed, 70 deletions(-) (limited to 'include/uapi/linux') diff --git a/block/ioctl.c b/block/ioctl.c index 77f5d17779d6..d8996bbd7f12 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -434,42 +434,6 @@ bool blkdev_dax_capable(struct block_device *bdev) return true; } - -static int blkdev_daxset(struct block_device *bdev, unsigned long argp) -{ - unsigned long arg; - int rc = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (get_user(arg, (int __user *)(argp))) - return -EFAULT; - arg = !!arg; - if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) - return 0; - - if (arg) - arg = S_DAX; - - if (arg && !blkdev_dax_capable(bdev)) - return -ENOTTY; - - inode_lock(bdev->bd_inode); - if (bdev->bd_map_count == 0) - inode_set_flags(bdev->bd_inode, arg, S_DAX); - else - rc = -EBUSY; - inode_unlock(bdev->bd_inode); - return rc; -} -#else -static int blkdev_daxset(struct block_device *bdev, int arg) -{ - if (arg) - return -ENOTTY; - return 0; -} #endif static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, @@ -634,8 +598,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); - case BLKDAXSET: - return blkdev_daxset(bdev, arg); case BLKDAXGET: return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); break; diff --git a/fs/block_dev.c b/fs/block_dev.c index 7b9cd49622b1..afb437484362 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1736,37 +1736,13 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); } -static void blkdev_vm_open(struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - inode_lock(bd_inode); - bdev->bd_map_count++; - inode_unlock(bd_inode); -} - -static void blkdev_vm_close(struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - inode_lock(bd_inode); - bdev->bd_map_count--; - inode_unlock(bd_inode); -} - static const struct vm_operations_struct blkdev_dax_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = blkdev_dax_fault, .pmd_fault = blkdev_dax_pmd_fault, .pfn_mkwrite = blkdev_dax_fault, }; static const struct vm_operations_struct blkdev_default_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = filemap_fault, .map_pages = filemap_map_pages, }; @@ -1774,18 +1750,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = { static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *bd_inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(bd_inode); file_accessed(file); - inode_lock(bd_inode); - bdev->bd_map_count++; if (IS_DAX(bd_inode)) { vma->vm_ops = &blkdev_dax_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; } else { vma->vm_ops = &blkdev_default_vm_ops; } - inode_unlock(bd_inode); return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index b10002d4a5f5..ae681002100a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -484,9 +484,6 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -#ifdef CONFIG_FS_DAX - int bd_map_count; -#endif }; /* diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 41e0433b4a83..149bec83a907 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -222,7 +222,6 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) -#define BLKDAXSET _IO(0x12,128) #define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ -- cgit v1.2.3