diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 07:47:51 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 07:47:51 +0300 |
commit | 149c51f876322d9bfbd5e2d6ffae7aff3d794384 (patch) | |
tree | a61c7dd828356e307fca06fc66dbdbf9b109c18f /include/uapi | |
parent | 97971df811b8854882c0f6c6631e23ab8cdcc44f (diff) | |
parent | b7af0635c87ff78d6bd523298ab7471f9ffd3ce5 (diff) | |
download | linux-149c51f876322d9bfbd5e2d6ffae7aff3d794384.tar.xz |
Merge tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"This round there are a lot of cleanups and moved code so the diffstat
looks huge, otherwise there are some nice performance improvements and
an update to raid56 reliability.
User visible features:
- raid56 reliability vs performance trade off:
- fix destructive RMW for raid5 data (raid6 still needs work): do
full checksum verification for all data during RMW cycle, this
should prevent rewriting potentially corrupted data without
notice
- stripes are cached in memory which should reduce the performance
impact but still can hurt some workloads
- checksums are verified after repair again
- this is the last option without introducing additional features
(write intent bitmap, journal, another tree), the extra checksum
read/verification was supposed to be avoided by the original
implementation exactly for performance reasons but that caused
all the reliability problems
- discard=async by default for devices that support it
- implement emergency flush reserve to avoid almost all unnecessary
transaction aborts due to ENOSPC in cases where there are too many
delayed refs or delayed allocation
- skip block group synchronization if there's no change in used
bytes, can reduce transaction commit count for some workloads
Performance improvements:
- fiemap and lseek:
- overall speedup due to skipping unnecessary or duplicate
searches (-40% run time)
- cache some data structures and sharedness of extents (-30% run
time)
- send:
- faster backref resolution when finding clones
- cached leaf to root mapping for faster backref walking
- improved clone/sharing detection
- overall run time improvements (-70%)
Core:
- module initialization converted to a table of function pointers run
in a sequence
- preparation for fscrypt, extend passing file names across calls,
dir item can store encryption status
- raid56 updates:
- more accurate error tracking of sectors within stripe
- simplify recovery path and remove dedicated endio worker kthread
- simplify scrub call paths
- refactoring to support the extra data checksum verification
during RMW cycle
- tree block parentness checks consolidated and done at metadata read
time
- improved error handling
- cleanups:
- move a lot of code for better synchronization between kernel and
user space sources, split big files
- enum cleanups
- GFP flag cleanups
- header file cleanups, prototypes, dependencies
- redundant parameter cleanups
- inline extent handling simplifications
- inode parameter conversion
- data structure cleanups, reductions, renames, merges"
* tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (249 commits)
btrfs: print transaction aborted messages with an error level
btrfs: sync some cleanups from progs into uapi/btrfs.h
btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range
btrfs: fix extent map use-after-free when handling missing device in read_one_chunk
btrfs: remove outdated logic from overwrite_item() and add assertion
btrfs: unify overwrite_item() and do_overwrite_item()
btrfs: replace strncpy() with strscpy()
btrfs: fix uninitialized variable in find_first_clear_extent_bit
btrfs: fix uninitialized parent in insert_state
btrfs: add might_sleep() annotations
btrfs: add stack helpers for a few btrfs items
btrfs: add nr_global_roots to the super block definition
btrfs: remove BTRFS_LEAF_DATA_OFFSET
btrfs: add helpers for manipulating leaf items and data
btrfs: add eb to btrfs_node_key_ptr_offset
btrfs: pass the extent buffer for the btrfs_item_nr helpers
btrfs: move the csum helpers into ctree.h
btrfs: move eb offset helpers into extent_io.h
btrfs: move file_extent_item helpers into file-item.h
btrfs: move leaf_data_end into ctree.c
...
Diffstat (limited to 'include/uapi')
-rw-r--r-- | include/uapi/linux/btrfs.h | 36 | ||||
-rw-r--r-- | include/uapi/linux/btrfs_tree.h | 235 |
2 files changed, 265 insertions, 6 deletions
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5655e89b962b..b4f0f9531119 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -19,8 +19,14 @@ #ifndef _UAPI_LINUX_BTRFS_H #define _UAPI_LINUX_BTRFS_H + +#ifdef __cplusplus +extern "C" { +#endif + #include <linux/types.h> #include <linux/ioctl.h> +#include <linux/fs.h> #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_VOL_NAME_MAX 255 @@ -333,6 +339,12 @@ struct btrfs_ioctl_feature_flags { */ struct btrfs_balance_args { __u64 profiles; + + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ union { __u64 usage; struct { @@ -549,7 +561,7 @@ struct btrfs_ioctl_search_header { __u64 offset; __u32 type; __u32 len; -}; +} __attribute__ ((__may_alias__)); #define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) /* @@ -562,6 +574,10 @@ struct btrfs_ioctl_search_args { char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; }; +/* + * Extended version of TREE_SEARCH ioctl that can return more than 4k of bytes. + * The allocated size of the buffer is set in buf_size. + */ struct btrfs_ioctl_search_args_v2 { struct btrfs_ioctl_search_key key; /* in/out - search parameters */ __u64 buf_size; /* in - size of buffer @@ -570,10 +586,11 @@ struct btrfs_ioctl_search_args_v2 { __u64 buf[]; /* out - found items */ }; +/* With a @src_length of zero, the range from @src_offset->EOF is cloned! */ struct btrfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_offset, src_length; - __u64 dest_offset; + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; }; /* @@ -677,8 +694,11 @@ struct btrfs_ioctl_logical_ino_args { /* struct btrfs_data_container *inodes; out */ __u64 inodes; }; -/* Return every ref to the extent, not just those containing logical block. - * Requires logical == extent bytenr. */ + +/* + * Return every ref to the extent, not just those containing logical block. + * Requires logical == extent bytenr. + */ #define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0) enum btrfs_dev_stat_values { @@ -1144,4 +1164,8 @@ enum btrfs_err_code { #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ struct btrfs_ioctl_encoded_io_args) +#ifdef __cplusplus +} +#endif + #endif /* _UAPI_LINUX_BTRFS_H */ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 1f7a38ec6ac3..ab38d0f411fa 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -10,6 +10,23 @@ #include <stddef.h> #endif +/* ASCII for _BHRfS_M, no terminating nul */ +#define BTRFS_MAGIC 0x4D5F53665248425FULL + +#define BTRFS_MAX_LEVEL 8 + +/* + * We can actually store much bigger names, but lets not confuse the rest of + * linux. + */ +#define BTRFS_NAME_LEN 255 + +/* + * Theoretical limit is larger, but we keep this down to a sane value. That + * should limit greatly the possibility of collisions on inode ref items. + */ +#define BTRFS_LINK_MAX 65535U + /* * This header contains the structure definitions and constants used * by file system objects that can be retrieved using @@ -359,6 +376,50 @@ enum btrfs_csum_type { #define BTRFS_FT_SYMLINK 7 #define BTRFS_FT_XATTR 8 #define BTRFS_FT_MAX 9 +/* Directory contains encrypted data */ +#define BTRFS_FT_ENCRYPTED 0x80 + +static inline __u8 btrfs_dir_flags_to_ftype(__u8 flags) +{ + return flags & ~BTRFS_FT_ENCRYPTED; +} + +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1U << 0) +#define BTRFS_INODE_NODATACOW (1U << 1) +#define BTRFS_INODE_READONLY (1U << 2) +#define BTRFS_INODE_NOCOMPRESS (1U << 3) +#define BTRFS_INODE_PREALLOC (1U << 4) +#define BTRFS_INODE_SYNC (1U << 5) +#define BTRFS_INODE_IMMUTABLE (1U << 6) +#define BTRFS_INODE_APPEND (1U << 7) +#define BTRFS_INODE_NODUMP (1U << 8) +#define BTRFS_INODE_NOATIME (1U << 9) +#define BTRFS_INODE_DIRSYNC (1U << 10) +#define BTRFS_INODE_COMPRESS (1U << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) + +#define BTRFS_INODE_FLAG_MASK \ + (BTRFS_INODE_NODATASUM | \ + BTRFS_INODE_NODATACOW | \ + BTRFS_INODE_READONLY | \ + BTRFS_INODE_NOCOMPRESS | \ + BTRFS_INODE_PREALLOC | \ + BTRFS_INODE_SYNC | \ + BTRFS_INODE_IMMUTABLE | \ + BTRFS_INODE_APPEND | \ + BTRFS_INODE_NODUMP | \ + BTRFS_INODE_NOATIME | \ + BTRFS_INODE_DIRSYNC | \ + BTRFS_INODE_COMPRESS | \ + BTRFS_INODE_ROOT_ITEM_INIT) + +#define BTRFS_INODE_RO_VERITY (1U << 0) + +#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) /* * The key defines the order in the tree, and so it also defines (optimal) @@ -389,6 +450,109 @@ struct btrfs_key { __u64 offset; } __attribute__ ((__packed__)); +/* + * Every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* These first four must match the super block */ + __u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific uuid */ + __u8 fsid[BTRFS_FSID_SIZE]; + /* Which block this node is supposed to live in */ + __le64 bytenr; + __le64 flags; + + /* Allowed to be different from the super from here on down */ + __u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + __u8 level; +} __attribute__ ((__packed__)); + +/* + * This is a very generous portion of the super block, giving us room to + * translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 + +/* + * Just in case we somehow lose the roots and are not able to mount, we store + * an array of the roots from previous transactions in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unused_64[4]; + + __u8 tree_root_level; + __u8 chunk_root_level; + __u8 extent_root_level; + __u8 fs_root_level; + __u8 dev_root_level; + __u8 csum_root_level; + /* future and to align */ + __u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/* + * A leaf is full of items. offset and size tell us where to find the item in + * the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * Leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together during + * searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * All non-leaf blocks are nodes, they hold only keys and pointers to other + * blocks. + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid; @@ -472,6 +636,69 @@ struct btrfs_chunk { /* additional stripes go here */ } __attribute__ ((__packed__)); +/* + * The super block basically lists the main trees of the FS. + */ +struct btrfs_super_block { + /* The first 4 fields must match struct btrfs_header */ + __u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific UUID, visible to user */ + __u8 fsid[BTRFS_FSID_SIZE]; + /* This block number */ + __le64 bytenr; + __le64 flags; + + /* Allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* + * This member has never been utilized since the very beginning, thus + * it's always 0 regardless of kernel version. We always use + * generation + 1 to read log tree root. So here we mark it deprecated. + */ + __le64 __unused_log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 __unused_leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + __u8 root_level; + __u8 chunk_root_level; + __u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + __le64 cache_generation; + __le64 uuid_tree_generation; + + /* The UUID written into btree blocks */ + __u8 metadata_uuid[BTRFS_FSID_SIZE]; + + __u64 nr_global_roots; + + /* Future expansion */ + __le64 reserved[27]; + __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; + + /* Padded to 4096 bytes */ + __u8 padding[565]; +} __attribute__ ((__packed__)); + #define BTRFS_FREE_SPACE_EXTENT 1 #define BTRFS_FREE_SPACE_BITMAP 2 @@ -526,6 +753,14 @@ struct btrfs_extent_item_v0 { /* use full backrefs for extent pointers in the block */ #define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 + /* * this flag is only used internally by scrub and may be changed at any time * it is only declared here to avoid collisions |