From c6153ed23ed6a2bb8a5891382c06134674610f86 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 16 Apr 2026 19:01:18 +0100 Subject: btrfs: add ioctl GET_CSUMS to read raw checksums from file range Add a new unprivileged BTRFS_IOC_GET_CSUMS ioctl, which can be used to query the on-disk csums for a file range. The ioctl is deliberately per-file rather than exposing raw csum tree lookups, to avoid leaking information to users about files they may not have access to. This is done by userspace passing a struct btrfs_ioctl_get_csums_args to the kernel, which details the offset and length we're interested in, and a buffer for the kernel to write its results into. The kernel writes a struct btrfs_ioctl_get_csums_entry into the buffer, followed by the csums if available. The maximum size of the user buffer is capped to 16MiB. If the extent is an uncompressed, non-NODATASUM extent, the kernel sets the entry type to BTRFS_GET_CSUMS_HAS_CSUMS and follows it with the csums. If it is sparse, preallocated, or beyond the EOF, it sets the type to BTRFS_GET_CSUMS_ZEROED - this is so userspace knows it can use the precomputed hash of the zero sector. Otherwise, it sets the type to BTRFS_GET_CSUMS_NODATASUM, BTRFS_GET_CSUMS_COMPRESSED, BTRFS_GET_CSUM_ENCRYPTED, or BTRFS_GET_CSUM_INLINE. For example, a file with a [0, 4K) hole and [4K, 12K) data extent would produce the following output buffer: | [0, 4K) ZEROED | [4K, 12K) HAS_CSUMS | csum data | We do store the csums of compressed extents, but we deliberately don't return them here: they're calculated over the compressed data, not the uncompressed data that's returned to userspace. Similarly for encrypted data, once encryption is supported, in which the csums will be on the ciphertext. The main use case for this is for speeding up mkfs.btrfs --rootdir. For the case when the source FS is btrfs and using the same csum algorithm, we can avoid having to recalculate the csums - in my synthetic benchmarks (16GB file on a spinning-rust drive), this resulted in a ~11% speed-up (218s to 196s). When using the --reflink option added in btrfs-progs v6.16.1, we can forgo reading the data entirely, resulting a ~2200% speed-up on the same test (128s to 6s). # mkdir rootdir # dd if=/dev/urandom of=rootdir/file bs=4096 count=4194304 (without ioctl) # echo 3 > /proc/sys/vm/drop_caches # time mkfs.btrfs --rootdir rootdir testimg ... real 3m37.965s user 0m5.496s sys 0m6.125s # echo 3 > /proc/sys/vm/drop_caches # time mkfs.btrfs --rootdir rootdir --reflink testimg ... real 2m8.342s user 0m5.472s sys 0m1.667s (with ioctl) # echo 3 > /proc/sys/vm/drop_caches # time mkfs.btrfs --rootdir rootdir testimg ... real 3m15.865s user 0m4.258s sys 0m6.261s # echo 3 > /proc/sys/vm/drop_caches # time mkfs.btrfs --rootdir rootdir --reflink testimg ... real 0m5.847s user 0m2.899s sys 0m0.097s Another notable use case is for deduplication, where reading the checksums may serve as a hint instead of reading the whole file data. Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 9165154a274d..9b576603b3f1 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -1100,6 +1100,38 @@ enum btrfs_err_code { BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }; +/* Flags for struct btrfs_ioctl_get_csums_entry::type. */ +#define BTRFS_GET_CSUMS_HAS_CSUMS (1U << 0) +#define BTRFS_GET_CSUMS_ZEROED (1U << 1) +#define BTRFS_GET_CSUMS_NODATASUM (1U << 2) +#define BTRFS_GET_CSUMS_COMPRESSED (1U << 3) +#define BTRFS_GET_CSUMS_ENCRYPTED (1U << 4) +#define BTRFS_GET_CSUMS_INLINE (1U << 5) + +struct btrfs_ioctl_get_csums_entry { + /* File offset of this range. */ + __u64 offset; + /* Length in bytes. */ + __u64 length; + /* One of BTRFS_GET_CSUMS_* types. */ + __u32 type; + /* Padding, must be 0. */ + __u32 reserved; +}; + +struct btrfs_ioctl_get_csums_args { + /* In/out: file offset in bytes. */ + __u64 offset; + /* In/out: range length in bytes. */ + __u64 length; + /* In/out: buffer capacity / bytes written. */ + __u64 buf_size; + /* In: flags, must be 0 for now. */ + __u64 flags; + /* Out: entries of type btrfs_ioctl_get_csums_entry + csum data */ + __u8 buf[]; +}; + /* Flags for IOC_SHUTDOWN, must match XFS_FSOP_GOING_FLAGS_* flags. */ #define BTRFS_SHUTDOWN_FLAGS_DEFAULT 0x0 #define BTRFS_SHUTDOWN_FLAGS_LOGFLUSH 0x1 @@ -1226,6 +1258,8 @@ enum btrfs_err_code { struct btrfs_ioctl_encoded_io_args) #define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \ struct btrfs_ioctl_subvol_wait) +#define BTRFS_IOC_GET_CSUMS _IOWR(BTRFS_IOCTL_MAGIC, 66, \ + struct btrfs_ioctl_get_csums_args) /* Shutdown ioctl should follow XFS's interfaces, thus not using btrfs magic. */ #define BTRFS_IOC_SHUTDOWN _IOR('X', 125, __u32) -- cgit v1.2.3