Diffstat (limited to 'fs/btrfs')
136 files changed, 19312 insertions, 16609 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a25c9910d90b..4fb925e8c981 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -48,27 +48,6 @@ config BTRFS_FS_POSIX_ACL If you don't know what Access Control Lists are, say N -config BTRFS_FS_CHECK_INTEGRITY - bool "Btrfs with integrity check tool compiled in (DEPRECATED)" - depends on BTRFS_FS - help - This feature has been deprecated and will be removed in 6.7. - - Adds code that examines all block write requests (including - writes of the super block). The goal is to verify that the - state of the filesystem on disk is always consistent, i.e., - after a power-loss or kernel panic event the filesystem is - in a consistent state. - - If the integrity check tool is included and activated in - the mount options, plenty of kernel memory is used, and - plenty of additional CPU cycles are spent. Enabling this - functionality is not intended for normal use. - - In most cases, unless you are a btrfs developer who needs - to verify the integrity of (super)-block write requests - during the run of a regression test, say N - config BTRFS_FS_RUN_SANITY_TESTS bool "Btrfs will run sanity tests upon loading" depends on BTRFS_FS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 90d53209755b..87617f2968bc 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ - lru_cache.o + lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o btrfs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 206cf1612c1d..e3716516ca38 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -3,9 +3,10 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "messages.h" -#include "ctree.h" +#include "extent_io.h" +#include "fs.h" #include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, @@ -27,7 +28,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb, void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) { token->eb = eb; - token->kaddr = page_address(eb->pages[0]); + token->kaddr = folio_address(eb->folios[0]); token->offset = 0; } @@ -50,7 +51,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e * an offset into the extent buffer page array, cast to a specific type. This * gives us all the type checking. * - * The extent buffer pages stored in the array pages do not form a contiguous + * The extent buffer pages stored in the array folios may not form a contiguous * phyusical range, but the API functions assume the linear offset to the range * from 0 to metadata node size. 
*/ @@ -60,28 +61,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ + const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ + const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + member_offset);\ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ \ ASSERT(token); \ ASSERT(token->kaddr); \ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - return get_unaligned_le##bits(token->kaddr + oip); \ + member_offset + size <= token->offset + unit_size) { \ + return get_unaligned_le##bits(token->kaddr + oil); \ } \ - token->kaddr = page_address(token->eb->pages[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ - return get_unaligned_le##bits(token->kaddr + oip); \ + token->kaddr = folio_address(token->eb->folios[idx]); \ + token->offset = idx << unit_shift; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ + return get_unaligned_le##bits(token->kaddr + oil); \ \ - memcpy(lebytes, token->kaddr + oip, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(lebytes, token->kaddr + oil, part); \ + token->kaddr = folio_address(token->eb->folios[idx + 1]); \ + token->offset = (idx + 1) << unit_shift; \ memcpy(lebytes + part, token->kaddr, size - part); \ return get_unaligned_le##bits(lebytes); \ } \ @@ -89,19 +92,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ + const unsigned long idx = get_eb_folio_index(eb, member_offset);\ + const unsigned long oil = get_eb_offset_in_folio(eb, \ + member_offset);\ + const int unit_size = eb->folio_size; \ + char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ - return get_unaligned_le##bits(kaddr + oip); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ + return get_unaligned_le##bits(kaddr + oil); \ \ - memcpy(lebytes, kaddr + oip, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(lebytes, kaddr + oil, part); \ + kaddr = folio_address(eb->folios[idx + 1]); \ memcpy(lebytes + part, kaddr, size - part); \ return get_unaligned_le##bits(lebytes); \ } \ @@ -110,53 +115,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - 
member_offset); \ + const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ + const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + member_offset);\ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ \ ASSERT(token); \ ASSERT(token->kaddr); \ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ + member_offset + size <= token->offset + unit_size) { \ + put_unaligned_le##bits(val, token->kaddr + oil); \ return; \ } \ - token->kaddr = page_address(token->eb->pages[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ + token->kaddr = folio_address(token->eb->folios[idx]); \ + token->offset = idx << unit_shift; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ + oil + size <= unit_size) { \ + put_unaligned_le##bits(val, token->kaddr + oil); \ return; \ } \ put_unaligned_le##bits(val, lebytes); \ - memcpy(token->kaddr + oip, lebytes, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(token->kaddr + oil, lebytes, part); \ + token->kaddr = folio_address(token->eb->folios[idx + 1]); \ + token->offset = (idx + 1) << unit_shift; \ memcpy(token->kaddr, lebytes + part, size - part); \ } \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ + const unsigned long idx = get_eb_folio_index(eb, member_offset);\ + const unsigned long oil = get_eb_offset_in_folio(eb, \ + member_offset);\ + const int unit_size = eb->folio_size; \ + char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, kaddr + oip); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ + oil + size <= unit_size) { \ + put_unaligned_le##bits(val, kaddr + oil); \ return; \ } \ \ put_unaligned_le##bits(val, lebytes); \ - memcpy(kaddr + oip, lebytes, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(kaddr + oil, lebytes, part); \ + kaddr = folio_address(eb->folios[idx + 1]); \ memcpy(kaddr, lebytes + part, size - part); \ } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 8cfc8214109c..7a7e0ef69973 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -3,7 +3,17 @@ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H +#include <linux/unaligned.h> #include <linux/stddef.h> +#include <linux/types.h> +#include <linux/align.h> +#include <linux/build_bug.h> +#include <linux/compiler.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <uapi/linux/btrfs_tree.h> + +struct extent_buffer; struct btrfs_map_token { struct extent_buffer *eb; @@ -24,7 +34,7 @@ void btrfs_init_map_token(struct 
btrfs_map_token *token, struct extent_buffer *e static inline u8 get_unaligned_le8(const void *p) { - return *(u8 *)p; + return *(const u8 *)p; } static inline void put_unaligned_le8(u8 val, void *p) @@ -38,8 +48,8 @@ static inline void put_unaligned_le8(u8 val, void *p) offsetof(type, member), \ sizeof_field(type, member))) -#define write_eb_member(eb, ptr, type, member, result) (\ - write_extent_buffer(eb, (char *)(result), \ +#define write_eb_member(eb, ptr, type, member, source) ( \ + write_extent_buffer(eb, (const char *)(source), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof_field(type, member))) @@ -89,14 +99,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ - const type *p = page_address(eb->pages[0]) + \ + const type *p = folio_address(eb->folios[0]) + \ offset_in_page(eb->start); \ return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \ put_unaligned_le##bits(val, &p->member); \ } @@ -305,6 +315,11 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64); +BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64); +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64); + /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, @@ -335,7 +350,7 @@ static inline void btrfs_tree_block_key(const struct extent_buffer *eb, static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) + const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } @@ -349,6 +364,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3 BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); +BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref, + root_id, 64); + BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, type, 8); BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, @@ -365,6 +383,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type) if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); return 0; } @@ -423,7 +443,7 @@ void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); static inline void btrfs_set_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) + const struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr; @@ -489,7 +509,7 @@ static inline void btrfs_item_key(const struct 
extent_buffer *eb, } static inline void btrfs_set_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) + const struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(eb, nr); @@ -830,45 +850,6 @@ static inline void btrfs_set_balance_sys(struct extent_buffer *eb, write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } -static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - const struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); - cpu->limit = le64_to_cpu(disk->limit); - cpu->stripes_min = le32_to_cpu(disk->stripes_min); - cpu->stripes_max = le32_to_cpu(disk->stripes_max); -} - -static inline void btrfs_cpu_balance_args_to_disk( - struct btrfs_disk_balance_args *disk, - const struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); - disk->limit = cpu_to_le64(cpu->limit); - disk->stripes_min = cpu_to_le32(cpu->stripes_min); - disk->stripes_max = cpu_to_le32(cpu->stripes_max); -} - /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); @@ -966,6 +947,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, rescan, 64); +BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item, + enable_gen, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7427449a04a3..e0ba00d64ea0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -12,7 +12,6 @@ #include <linux/sched/mm.h> #include <linux/slab.h> #include "ctree.h" -#include "btrfs_inode.h" #include "xattr.h" #include "acl.h" diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index a270e71ec05f..48b9ddae4a46 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,8 +3,15 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +struct posix_acl; +struct inode; +struct btrfs_trans_handle; + #ifdef CONFIG_BTRFS_FS_POSIX_ACL +struct mnt_idmap; +struct dentry; + struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); @@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, #else +#include <linux/errno.h> + +struct btrfs_trans_handle; + #define btrfs_get_acl NULL #define btrfs_set_acl NULL static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index ce083e99ef68..361a866c1995 100644 --- a/fs/btrfs/async-thread.c +++ 
b/fs/btrfs/async-thread.c @@ -9,8 +9,8 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> +#include <trace/events/btrfs.h> #include "async-thread.h" -#include "ctree.h" enum { WORK_DONE_BIT, @@ -242,7 +242,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, break; trace_btrfs_ordered_sched(work); spin_unlock_irqrestore(lock, flags); - work->ordered_func(work); + work->ordered_func(work, false); /* now take the lock again and drop our item from the list */ spin_lock_irqsave(lock, flags); @@ -277,7 +277,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, * We don't want to call the ordered free functions with * the lock held. */ - work->ordered_free(work); + work->ordered_func(work, true); /* NB: work must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, work); } @@ -285,7 +285,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, spin_unlock_irqrestore(lock, flags); if (free_self) { - self->ordered_free(self); + self->ordered_func(self, true); /* NB: self must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, self); } @@ -300,7 +300,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) /* * We should not touch things inside work in the following cases: - * 1) after work->func() if it has no ordered_free + * 1) after work->func() if it has no ordered_func(..., true) to free * Since the struct is freed in work->func(). * 2) after setting WORK_DONE_BIT * The work may be freed in other threads almost instantly. @@ -329,11 +329,10 @@ static void btrfs_work_helper(struct work_struct *normal_work) } void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, - btrfs_func_t ordered_func, btrfs_func_t ordered_free) + btrfs_ordered_func_t ordered_func) { work->func = func; work->ordered_func = ordered_func; - work->ordered_free = ordered_free; INIT_WORK(&work->normal_work, btrfs_work_helper); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 30f66c5e2e6e..04c2f3175828 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -7,17 +7,20 @@ #ifndef BTRFS_ASYNC_THREAD_H #define BTRFS_ASYNC_THREAD_H +#include <linux/compiler_types.h> #include <linux/workqueue.h> +#include <linux/list.h> struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; + typedef void (*btrfs_func_t)(struct btrfs_work *arg); +typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool); struct btrfs_work { btrfs_func_t func; - btrfs_func_t ordered_func; - btrfs_func_t ordered_free; + btrfs_ordered_func_t ordered_func; /* Don't touch things below */ struct work_struct normal_work; @@ -35,7 +38,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( struct btrfs_fs_info *fs_info, const char *name, unsigned int flags); void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, - btrfs_func_t ordered_func, btrfs_func_t ordered_free); + btrfs_ordered_func_t ordered_func); void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a2ba1c7fc16a..f8e1d5b2c512 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -198,10 +198,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache; int __init btrfs_prelim_ref_init(void) { btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", - sizeof(struct prelim_ref), - 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct 
prelim_ref), 0, 0, NULL); if (!btrfs_prelim_ref_cache) return -ENOMEM; return 0; @@ -222,8 +219,8 @@ static void free_pref(struct prelim_ref *ref) * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 * indicates a 'higher' block. */ -static int prelim_ref_compare(struct prelim_ref *ref1, - struct prelim_ref *ref2) +static int prelim_ref_compare(const struct prelim_ref *ref1, + const struct prelim_ref *ref2) { if (ref1->level < ref2->level) return -1; @@ -254,7 +251,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount, struct prelim_ref *newref) + int newcount, const struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; @@ -264,7 +261,7 @@ static void update_share_count(struct share_check *sc, int oldcount, else if (oldcount < 1 && newcount > 0) sc->share_count++; - if (newref->root_id == sc->root->root_key.objectid && + if (newref->root_id == btrfs_root_id(sc->root) && newref->wanted_disk_byte == sc->data_bytenr && newref->key_for_search.objectid == sc->inum) sc->self_ref_count += newref->count; @@ -772,7 +769,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, continue; } - if (sc && ref->root_id != sc->root->root_key.objectid) { + if (sc && ref->root_id != btrfs_root_id(sc->root)) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; @@ -922,40 +919,38 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ - struct btrfs_delayed_tree_ref *ref; struct btrfs_key *key_ptr = NULL; + /* The owner of a tree block ref is the level. */ + int level = btrfs_delayed_ref_owner(node); if (head->extent_op && head->extent_op->update_key) { btrfs_disk_key_to_cpu(&key, &head->extent_op->key); key_ptr = &key; } - ref = btrfs_delayed_node_to_tree_ref(node); - ret = add_indirect_ref(fs_info, preftrees, ref->root, - key_ptr, ref->level + 1, - node->bytenr, count, sc, - GFP_ATOMIC); + ret = add_indirect_ref(fs_info, preftrees, node->ref_root, + key_ptr, level + 1, node->bytenr, + count, sc, GFP_ATOMIC); break; } case BTRFS_SHARED_BLOCK_REF_KEY: { - /* SHARED DIRECT METADATA backref */ - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); + /* + * SHARED DIRECT METADATA backref + * + * The owner of a tree block ref is the level. 
+ */ + int level = btrfs_delayed_ref_owner(node); - ret = add_direct_ref(fs_info, preftrees, ref->level + 1, - ref->parent, node->bytenr, count, + ret = add_direct_ref(fs_info, preftrees, level + 1, + node->parent, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_EXTENT_DATA_REF_KEY: { /* NORMAL INDIRECT DATA backref */ - struct btrfs_delayed_data_ref *ref; - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; + key.objectid = btrfs_delayed_ref_owner(node); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; + key.offset = btrfs_delayed_ref_offset(node); /* * If we have a share check context and a reference for @@ -975,18 +970,14 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, if (sc && count < 0) sc->have_delayed_delete_refs = true; - ret = add_indirect_ref(fs_info, preftrees, ref->root, + ret = add_indirect_ref(fs_info, preftrees, node->ref_root, &key, 0, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ - struct btrfs_delayed_data_ref *ref; - - ref = btrfs_delayed_node_to_data_ref(node); - - ret = add_direct_ref(fs_info, preftrees, 0, ref->parent, + ret = add_direct_ref(fs_info, preftrees, 0, node->parent, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -1036,8 +1027,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); if (ctx->check_extent_item) { @@ -1129,6 +1118,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, count, sc, GFP_NOFS); break; } + case BTRFS_EXTENT_OWNER_REF_KEY: + ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA)); + break; default: WARN_ON(1); } @@ -1432,8 +1424,10 @@ again: if (ret < 0) goto out; if (ret == 0) { - /* This shouldn't happen, indicates a bug or fs corruption. */ - ASSERT(ret != 0); + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto out; } @@ -2222,6 +2216,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. 
+ */ + return -EUCLEAN; + } ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { @@ -2244,7 +2245,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, eb = path->nodes[0]; item_size = btrfs_item_size(eb, path->slots[0]); - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); @@ -2623,7 +2623,7 @@ static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) btrfs_debug(fs_root->fs_info, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, - fs_root->root_key.objectid); + btrfs_root_id(fs_root)); ret = inode_to_path(parent, name_len, (unsigned long)(iref + 1), eb, ipath); if (ret) @@ -2841,6 +2841,16 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf return ret; } +static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) +{ + iter->bytenr = 0; + iter->item_ptr = 0; + iter->cur_ptr = 0; + iter->end_ptr = 0; + btrfs_release_path(iter->path); + memset(&iter->cur_key, 0, sizeof(iter->cur_key)); +} + int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) { struct btrfs_fs_info *fs_info = iter->fs_info; @@ -2859,6 +2869,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) if (ret < 0) return ret; if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto release; } @@ -2929,6 +2943,14 @@ release: return ret; } +static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter) +{ + if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || + iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) + return true; + return false; +} + /* * Go to the next backref item of current bytenr, can be either inlined or * keyed. 
@@ -2941,7 +2963,7 @@ release: */ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) { - struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct extent_buffer *eb = iter->path->nodes[0]; struct btrfs_root *extent_root; struct btrfs_path *path = iter->path; struct btrfs_extent_inline_ref *iref; @@ -2992,7 +3014,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) } void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, - struct btrfs_backref_cache *cache, int is_reloc) + struct btrfs_backref_cache *cache, bool is_reloc) { int i; @@ -3029,6 +3051,19 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( return node; } +void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); + cache->nr_nodes--; + btrfs_put_root(node->root); + kfree(node); + } +} + struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache) { @@ -3040,6 +3075,52 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge( return edge; } +void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node) +{ + if (node->eb) { + btrfs_backref_unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +/* + * Drop the backref node from cache without cleaning up its children + * edges. + * + * This can only be called on node without parent edges. + * The children edges are still kept as is. + */ +void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node) +{ + ASSERT(list_empty(&node->upper)); + + btrfs_backref_drop_node_buffer(node); + list_del_init(&node->list); + list_del_init(&node->lower); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + btrfs_backref_free_node(tree, node); +} + /* * Drop the backref node from cache, also cleaning up all its * upper edges and any uncached nodes in the path. 
@@ -3115,6 +3196,19 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) ASSERT(!cache->nr_edges); } +void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which) +{ + ASSERT(upper && lower && upper->level == lower->level + 1); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + if (link_which & LINK_LOWER) + list_add_tail(&edge->list[LOWER], &lower->upper); + if (link_which & LINK_UPPER) + list_add_tail(&edge->list[UPPER], &upper->lower); +} /* * Handle direct tree backref * @@ -3265,7 +3359,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { btrfs_err(fs_info, "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", - cur->bytenr, level - 1, root->root_key.objectid, + cur->bytenr, level - 1, btrfs_root_id(root), tree_key->objectid, tree_key->type, tree_key->offset); btrfs_put_root(root); ret = -ENOENT; @@ -3423,7 +3517,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, int type; cond_resched(); - eb = btrfs_backref_get_eb(iter); + eb = iter->path->nodes[0]; key.objectid = iter->bytenr; if (btrfs_backref_iter_is_inline_ref(iter)) { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 71d535e03dca..7dfcc9351bce 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -6,11 +6,23 @@ #ifndef BTRFS_BACKREF_H #define BTRFS_BACKREF_H -#include <linux/btrfs.h> +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" -#include "ulist.h" +#include "locking.h" #include "disk-io.h" #include "extent_io.h" +#include "ctree.h" + +struct extent_inode_elem; +struct ulist; +struct btrfs_extent_item; +struct btrfs_trans_handle; +struct btrfs_fs_info; /* * Used by implementations of iterate_extent_inodes_t (see definition below) to @@ -247,7 +259,7 @@ struct prelim_ref { struct rb_node rbnode; u64 root_id; struct btrfs_key key_for_search; - int level; + u8 level; int count; struct extent_inode_elem *inode_list; u64 parent; @@ -271,22 +283,6 @@ struct btrfs_backref_iter { struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); -static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) -{ - if (!iter) - return; - btrfs_free_path(iter->path); - kfree(iter); -} - -static inline struct extent_buffer *btrfs_backref_get_eb( - struct btrfs_backref_iter *iter) -{ - if (!iter) - return NULL; - return iter->path->nodes[0]; -} - /* * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header. 
@@ -306,25 +302,6 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr); int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); -static inline bool btrfs_backref_iter_is_inline_ref( - struct btrfs_backref_iter *iter) -{ - if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || - iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) - return true; - return false; -} - -static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) -{ - iter->bytenr = 0; - iter->item_ptr = 0; - iter->cur_ptr = 0; - iter->end_ptr = 0; - btrfs_release_path(iter->path); - memset(&iter->cur_key, 0, sizeof(iter->cur_key)); -} - /* * Backref cache related structures * @@ -440,102 +417,41 @@ struct btrfs_backref_cache { * Reloction backref cache require more info for reloc root compared * to generic backref cache. */ - unsigned int is_reloc; + bool is_reloc; }; void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, - struct btrfs_backref_cache *cache, int is_reloc); + struct btrfs_backref_cache *cache, bool is_reloc); struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_cache *cache, u64 bytenr, int level); struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); -#define LINK_LOWER (1 << 0) -#define LINK_UPPER (1 << 1) -static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which) -{ - ASSERT(upper && lower && upper->level == lower->level + 1); - edge->node[LOWER] = lower; - edge->node[UPPER] = upper; - if (link_which & LINK_LOWER) - list_add_tail(&edge->list[LOWER], &lower->upper); - if (link_which & LINK_UPPER) - list_add_tail(&edge->list[UPPER], &upper->lower); -} - -static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, - struct btrfs_backref_node *node) -{ - if (node) { - ASSERT(list_empty(&node->list)); - ASSERT(list_empty(&node->lower)); - ASSERT(node->eb == NULL); - cache->nr_nodes--; - btrfs_put_root(node->root); - kfree(node); - } -} - -static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, - struct btrfs_backref_edge *edge) -{ - if (edge) { - cache->nr_edges--; - kfree(edge); - } -} - -static inline void btrfs_backref_unlock_node_buffer( - struct btrfs_backref_node *node) -{ - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } -} - -static inline void btrfs_backref_drop_node_buffer( - struct btrfs_backref_node *node) -{ - if (node->eb) { - btrfs_backref_unlock_node_buffer(node); - free_extent_buffer(node->eb); - node->eb = NULL; - } -} +#define LINK_LOWER (1U << 0) +#define LINK_UPPER (1U << 1) -/* - * Drop the backref node from cache without cleaning up its children - * edges. - * - * This can only be called on node without parent edges. - * The children edges are still kept as is. 
- */ -static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, - struct btrfs_backref_node *node) -{ - ASSERT(list_empty(&node->upper)); - - btrfs_backref_drop_node_buffer(node); - list_del_init(&node->list); - list_del_init(&node->lower); - if (!RB_EMPTY_NODE(&node->rb_node)) - rb_erase(&node->rb_node, &tree->rb_root); - btrfs_backref_free_node(tree, node); -} +void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which); +void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node); +void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge); +void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node); +void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node); void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); +void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node); void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info, - u64 bytenr, int errno) + u64 bytenr, int error) { - btrfs_panic(fs_info, errno, + btrfs_panic(fs_info, error, "Inconsistency in backref cache found at offset %llu", bytenr); } diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 650972895652..5fd0b39d8c70 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -10,11 +10,10 @@ #include "volumes.h" #include "raid56.h" #include "async-thread.h" -#include "check-integrity.h" #include "dev-replace.h" -#include "rcu-string.h" #include "zoned.h" #include "file-item.h" +#include "raid-stripe-tree.h" static struct bio_set btrfs_bioset; static struct bio_set btrfs_clone_bioset; @@ -30,7 +29,7 @@ struct btrfs_failed_bio { /* Is this a data path I/O that needs storage layer checksum and repair? */ static inline bool is_data_bbio(struct btrfs_bio *bbio) { - return bbio->inode && is_data_inode(&bbio->inode->vfs_inode); + return bbio->inode && is_data_inode(bbio->inode); } static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) @@ -50,11 +49,12 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + WRITE_ONCE(bbio->status, BLK_STS_OK); } /* * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * btrfs, and is used for all I/O submitted through btrfs_submit_bbio(). * * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. 
@@ -74,20 +74,16 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, - u64 map_length, bool use_append) + u64 map_length) { struct btrfs_bio *bbio; struct bio *bio; - if (use_append) { - unsigned int nr_segs; + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + if (IS_ERR(bio)) + return ERR_CAST(bio); - bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, - &btrfs_clone_bioset, map_length); - } else { - bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, - GFP_NOFS, &btrfs_clone_bioset); - } bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -124,43 +120,26 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; - __btrfs_bio_end_io(bbio); -} - -static void btrfs_orig_write_end_io(struct bio *bio); - -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ - /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. - */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; - - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} - -static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) -{ if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); btrfs_cleanup_bio(bbio); bbio = orig_bbio; } - if (atomic_dec_and_test(&bbio->pending_ios)) + /* + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. + */ + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); + + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. */ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); __btrfs_bio_end_io(bbio); + } } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -180,7 +159,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) static void btrfs_repair_done(struct btrfs_failed_bio *fbio) { if (atomic_dec_and_test(&fbio->repair_count)) { - btrfs_orig_bbio_end_io(fbio->bbio); + btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status); mempool_free(fbio, &btrfs_failed_bio_pool); } } @@ -194,6 +173,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); int mirror = repair_bbio->mirror_num; + /* + * We can only trigger this for data bio, which doesn't support larger + * folios yet. 
+ */ + ASSERT(folio_order(page_folio(bv->bv_page)) == 0); + if (repair_bbio->bio.bi_status || !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); @@ -206,7 +191,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return; } @@ -215,7 +200,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - bv->bv_page, bv->bv_offset, mirror); + page_folio(bv->bv_page), bv->bv_offset, mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -275,7 +260,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return fbio; } @@ -321,7 +306,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de if (fbio) btrfs_repair_done(fbio); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) @@ -355,7 +340,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) @@ -373,9 +358,9 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } } @@ -389,7 +374,7 @@ static void btrfs_raid56_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -416,7 +401,10 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - btrfs_orig_bbio_end_io(bbio); + if (bio_is_zone_append(bio) && !bio->bi_status) + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -427,6 +415,8 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); + } else if (bio_is_zone_append(bio)) { + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } /* Pass on control to the original bio this one was cloned from */ @@ -463,8 +453,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); - btrfsic_check_bio(bio); - if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -490,11 +478,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) bio->bi_private = &bioc->stripes[dev_nr]; bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; bioc->stripes[dev_nr].bioc = 
bioc; + bioc->size = bio->bi_iter.bi_size; btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, - struct btrfs_io_stripe *smap, int mirror_num) +static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { if (!bioc) { /* Single mirror read/write fast path. */ @@ -568,16 +557,23 @@ static void run_one_async_start(struct btrfs_work *work) * * At IO completion time the csums attached on the ordered extent record are * inserted into the tree. + * + * If called with @do_free == true, then it will free the work struct. */ -static void run_one_async_done(struct btrfs_work *work) +static void run_one_async_done(struct btrfs_work *work, bool do_free) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); struct bio *bio = &async->bbio->bio; + if (do_free) { + kfree(container_of(work, struct async_submit_bio, work)); + return; + } + /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_orig_bbio_end_io(async->bbio); + btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); return; } @@ -587,18 +583,25 @@ static void run_one_async_done(struct btrfs_work *work) * context. This changes nothing when cgroups aren't in use. */ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; - __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); -} - -static void run_one_async_free(struct btrfs_work *work) -{ - kfree(container_of(work, struct async_submit_bio, work)); + btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } static bool should_async_write(struct btrfs_bio *bbio) { + bool auto_csum_mode = true; + +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; + enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); + + if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) + return false; + + auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO); +#endif + /* Submit synchronously if the checksum implementation is fast. */ - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) return false; /* @@ -618,7 +621,7 @@ static bool should_async_write(struct btrfs_bio *bbio) /* * Submit bio to an async queue. * - * Return true if the work has been succesfuly submitted, else false. + * Return true if the work has been successfully submitted, else false. */ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, @@ -636,12 +639,30 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, async->smap = *smap; async->mirror_num = mirror_num; - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done); btrfs_queue_work(fs_info->workers, &async->work); return true; } +static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) +{ + unsigned int nr_segs; + int sector_offset; + + map_length = min(map_length, bbio->fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + &nr_segs, map_length); + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. 
+ */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } + return map_length; +} + static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; @@ -656,20 +677,34 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; + if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + smap.rst_search_commit_root = true; + else + smap.rst_search_commit_root = false; + btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); + &bioc, &smap, &mirror_num); if (error) { ret = errno_to_blk_status(error); - goto fail; + btrfs_bio_counter_dec(fs_info); + goto end_bbio; } map_length = min(map_length, length); if (use_append) - map_length = min(map_length, fs_info->max_zone_append_size); + map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + struct btrfs_bio *split; + + split = btrfs_split_bio(fs_info, bbio, map_length); + if (IS_ERR(split)) { + ret = errno_to_blk_status(PTR_ERR(split)); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; + } + bbio = split; bio = &bbio->bio; } @@ -690,12 +725,24 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } + if (is_data_bbio(bbio) && bioc && + btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + /* + * No locking for the list update, as we only add to + * the list in the I/O submission path, and list + * iteration only happens in the completion path, which + * can't happen until after the last submission. + */ + btrfs_get_bioc(bioc); + list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list); + } + /* * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. */ if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) && btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) @@ -713,7 +760,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) } } - __btrfs_submit_bio(bio, bioc, &smap, mirror_num); + btrfs_submit_bio(bio, bioc, &smap, mirror_num); done: return map_length == length; @@ -729,16 +776,15 @@ fail: ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining); - remaining->bio.bi_status = ret; - btrfs_orig_bbio_end_io(remaining); + btrfs_bio_end_io(remaining, ret); } - bbio->bio.bi_status = ret; - btrfs_orig_bbio_end_io(bbio); +end_bbio: + btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; } -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); @@ -750,7 +796,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) /* * Submit a repair write. * - * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. 
* @@ -758,8 +804,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) + u64 length, u64 logical, struct folio *folio, + unsigned int folio_offset, int mirror_num) { struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; @@ -790,9 +836,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); + ret = bio_add_folio(&bio, folio, length, folio_offset); + ASSERT(ret); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ @@ -840,7 +885,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_ ASSERT(smap.dev == fs_info->dev_replace.srcdev); smap.dev = fs_info->dev_replace.tgtdev; } - __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); return; fail: diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index ca79decee060..e2fe16074ad6 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -7,12 +7,14 @@ #ifndef BTRFS_BIO_H #define BTRFS_BIO_H +#include <linux/types.h> #include <linux/bio.h> #include <linux/workqueue.h> #include "tree-checker.h" struct btrfs_bio; struct btrfs_fs_info; +struct btrfs_inode; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -27,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and - * passed to btrfs_submit_bio for mapping to the physical devices. + * passed to btrfs_submit_bbio() for mapping to the physical devices. */ struct btrfs_bio { /* @@ -40,7 +42,7 @@ struct btrfs_bio { union { /* * For data reads: checksumming and original I/O information. - * (for internal use in the btrfs_submit_bio machinery only) + * (for internal use in the btrfs_submit_bbio() machinery only) */ struct { u8 *csum; @@ -77,6 +79,9 @@ struct btrfs_bio { /* File system that this I/O operates on. */ struct btrfs_fs_info *fs_info; + /* Save the first error status of split bio. */ + blk_status_t status; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -102,10 +107,10 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* Submit using blkcg_punt_bio_submit. 
*/ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); + u64 length, u64 logical, struct folio *folio, + unsigned int folio_offset, int mirror_num); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 434cf3d5f4cf..7eef79ece5b3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -23,7 +23,7 @@ #include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -34,15 +34,28 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) } #endif +static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group) +{ + /* The meta_write_pointer is available only on the zoned setup. */ + if (!btrfs_is_zoned(block_group->fs_info)) + return false; + + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + return false; + + return block_group->start + block_group->alloc_offset > + block_group->meta_write_pointer; +} + /* * Return target flags in extended format or 0 if restripe for this chunk_type * is not in progress * * Should be called with balance_lock held */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags) { - struct btrfs_balance_control *bctl = fs_info->balance_ctl; + const struct btrfs_balance_control *bctl = fs_info->balance_ctl; u64 target = 0; if (!bctl) @@ -168,7 +181,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) cache); kfree(cache->free_space_ctl); - kfree(cache->physical_map); + btrfs_free_chunk_map(cache->physical_map); kfree(cache); } } @@ -418,7 +431,7 @@ struct btrfs_caching_control *btrfs_get_caching_control( return ctl; } -void btrfs_put_caching_control(struct btrfs_caching_control *ctl) +static void btrfs_put_caching_control(struct btrfs_caching_control *ctl) { if (refcount_dec_and_test(&ctl->count)) kfree(ctl); @@ -935,7 +948,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) caching_ctl->block_group = cache; refcount_set(&caching_ctl->count, 2); atomic_set(&caching_ctl->progress, 0); - btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, caching_thread, NULL); spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { @@ -1022,6 +1035,13 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) } } +static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; + return btrfs_extent_root(fs_info, 0); +} + static int remove_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_block_group *block_group) @@ -1047,7 +1067,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em) + struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; 
struct btrfs_path *path; @@ -1059,11 +1079,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int index; int factor; struct btrfs_caching_control *caching_ctl = NULL; - bool remove_em; + bool remove_map; bool remove_rsv = false; - block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!block_group); + block_group = btrfs_lookup_block_group(fs_info, map->start); + if (!block_group) + return -ENOENT; + BUG_ON(!block_group->ro); trace_btrfs_remove_block_group(block_group); @@ -1240,6 +1262,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; spin_lock(&block_group->lock); + /* + * Hitting this WARN means we removed a block group with an unwritten + * region. It will cause "unable to find chunk map for logical" errors. + */ + if (WARN_ON(has_unwritten_metadata(block_group))) + btrfs_warn(fs_info, + "block group %llu is removed before metadata write out", + block_group->start); + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); /* @@ -1252,7 +1283,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * entries because we already removed them all when we called * btrfs_remove_free_space_cache(). * - * And we must not remove the extent map from the fs_info->mapping_tree + * And we must not remove the chunk map from the fs_info->mapping_tree * to prevent the same logical address range and physical device space * ranges from being reused for a new block group. This is needed to * avoid races with trimming and scrub. @@ -1268,25 +1299,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * in place until the extents have been discarded completely when * the transaction commit has completed. */ - remove_em = (atomic_read(&block_group->frozen) == 0); + remove_map = (atomic_read(&block_group->frozen) == 0); spin_unlock(&block_group->lock); - if (remove_em) { - struct extent_map_tree *em_tree; - - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - /* once for the tree */ - free_extent_map(em); - } + if (remove_map) + btrfs_remove_chunk_map(fs_info, map); out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); if (remove_rsv) - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); btrfs_free_path(path); return ret; } @@ -1295,15 +1318,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset) { struct btrfs_root *root = btrfs_block_group_root(fs_info); - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; unsigned int num_items; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - ASSERT(em && em->start == chunk_offset); + map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); + ASSERT(map != NULL); + ASSERT(map->start == chunk_offset); /* * We need to reserve 3 + N units from the metadata space info in order @@ -1324,9 +1344,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). 
*/ - map = em->map_lookup; num_items = 3 + map->num_stripes; - free_extent_map(em); + btrfs_free_chunk_map(map); return btrfs_start_transaction_fallback_global_rsv(root, num_items); } @@ -1418,9 +1437,9 @@ out: } static bool clean_pinned_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *bg) + const struct btrfs_block_group *bg) { - struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *prev_trans = NULL; const u64 start = bg->start; const u64 end = start + bg->length - 1; @@ -1442,7 +1461,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, * group in pinned_extents before we were able to clear the whole block * group range from pinned_extents. This means that task can lookup for * the block group after we unpinned it from pinned_extents and removed - * it, leading to a BUG_ON() at unpin_extent_range(). + * it, leading to an error at unpin_extent_range(). */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { @@ -1535,6 +1554,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group. + * + * Also bail out if this is the only block group for its + * type, because otherwise we would lose profile + * information from fs_info->avail_*_alloc_bits and the + * next block group of this type would be created with a + * "single" profile (even if we're in a raid fs) because + * fs_info->avail_*_alloc_bits would be 0. */ trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1563,8 +1589,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. */ used = btrfs_space_info_used(space_info, true); - if (space_info->total_bytes - block_group->length < used && - block_group->zone_unusable < block_group->length) { + if ((space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) || + has_unwritten_metadata(block_group)) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the @@ -1752,33 +1779,30 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } -static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) { if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; } -static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed) { - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info); + u64 thresh_bytes = mult_perc(bg->length, thresh_pct); const u64 new_val = bg->used; const u64 old_val = new_val + bytes_freed; - u64 thresh; - if (reclaim_thresh == 0) + if (thresh_bytes == 0) return false; - thresh = mult_perc(bg->length, reclaim_thresh); - /* * If we were below the threshold before don't reclaim, we are likely a * brand new block group and we don't want to relocate new block groups. 
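For concreteness, here are worked numbers for the threshold check that concludes just below (a userspace sketch; mult_perc() is open-coded and all values are invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t mult_perc(uint64_t num, uint32_t percent)
{
	return (num * percent) / 100;
}

/* Reclaim only when usage falls *through* the threshold. */
static bool should_reclaim(uint64_t bg_length, uint32_t thresh_pct,
			   uint64_t used, uint64_t bytes_freed)
{
	const uint64_t thresh_bytes = mult_perc(bg_length, thresh_pct);
	const uint64_t new_val = used;
	const uint64_t old_val = new_val + bytes_freed;

	if (thresh_bytes == 0)
		return false;
	if (old_val < thresh_bytes)  /* was already below: likely a new group */
		return false;
	if (new_val >= thresh_bytes) /* still above: nothing to do yet */
		return false;
	return true;
}

int main(void)
{
	const uint64_t gib = 1ULL << 30;

	/* 1 GiB group, 75% threshold (768 MiB): 800 -> 700 MiB crosses it. */
	printf("%d\n", should_reclaim(gib, 75, 700ULL << 20, 100ULL << 20));
	/* Already at 700 MiB before the free: left alone. */
	printf("%d\n", should_reclaim(gib, 75, 650ULL << 20, 50ULL << 20));
	return 0;
}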
*/ - if (old_val < thresh) + if (old_val < thresh_bytes) return false; - if (new_val >= thresh) + if (new_val >= thresh_bytes) return false; return true; } @@ -1826,6 +1850,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; + u64 reclaimed; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1839,6 +1864,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) /* Don't race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); + spin_lock(&space_info->lock); spin_lock(&bg->lock); if (bg->reserved || bg->pinned || bg->ro) { /* @@ -1848,6 +1874,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) * this block group. */ spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } @@ -1866,6 +1893,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_mark_bg_unused(bg); spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; @@ -1882,10 +1910,23 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!should_reclaim_block_group(bg, bg->length)) { spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. As soon as the block group is read only its + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); /* * Get out fast, in case we're read-only or unmounting the @@ -1900,13 +1941,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) @@ -1918,15 +1952,26 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(bg->used * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); + reclaimed = bg->used; ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); + reclaimed = 0; + spin_lock(&space_info->lock); + space_info->reclaim_errors++; + if (READ_ONCE(space_info->periodic_reclaim)) + space_info->periodic_reclaim_ready = false; + spin_unlock(&space_info->lock); } + spin_lock(&space_info->lock); + space_info->reclaim_count++; + space_info->reclaim_bytes += reclaimed; + spin_unlock(&space_info->lock); next: - if (ret) { + if (ret && !READ_ONCE(space_info->periodic_reclaim)) { /* Refcount held by the reclaim_bgs list after splice.
*/ spin_lock(&fs_info->unused_bgs_lock); /* @@ -1968,6 +2013,7 @@ end: void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) { + btrfs_reclaim_sweep(fs_info); spin_lock(&fs_info->unused_bgs_lock); if (!list_empty(&fs_info->reclaim_bgs)) queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); @@ -1987,11 +2033,10 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } -static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct btrfs_path *path) +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, + const struct btrfs_path *path) { - struct extent_map_tree *em_tree; - struct extent_map *em; + struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; struct extent_buffer *leaf; int slot; @@ -2001,23 +2046,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, slot = path->slots[0]; leaf = path->nodes[0]; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, key->objectid, key->offset); - read_unlock(&em_tree->lock); - if (!em) { + map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset); + if (!map) { btrfs_err(fs_info, "logical %llu len %llu found bg but no related chunk", key->objectid, key->offset); return -ENOENT; } - if (em->start != key->objectid || em->len != key->offset) { + if (map->start != key->objectid || map->chunk_len != key->offset) { btrfs_err(fs_info, "block group %llu len %llu mismatch with chunk %llu len %llu", - key->objectid, key->offset, em->start, em->len); + key->objectid, key->offset, map->start, map->chunk_len); ret = -EUCLEAN; - goto out_free_em; + goto out_free_map; } read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), @@ -2025,22 +2067,22 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, flags = btrfs_stack_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK; - if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", key->objectid, key->offset, flags, - (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); + (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type)); ret = -EUCLEAN; } -out_free_em: - free_extent_map(em); +out_free_map: + btrfs_free_chunk_map(map); return ret; } static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; @@ -2087,8 +2129,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 *buf; u64 bytenr; u64 data_stripe_length; @@ -2096,14 +2137,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, int i, nr = 0; int ret = 0; - em = btrfs_get_chunk_map(fs_info, chunk_start, 1); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(map)) return -EIO; - map = em->map_lookup; - data_stripe_length = em->orig_block_len; + data_stripe_length = map->stripe_size; io_stripe_size = BTRFS_STRIPE_LEN; - chunk_start = em->start; + chunk_start = map->start; /* For RAID5/6 adjust to a full IO stripe length 
*/ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -2157,7 +2197,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, *naddrs = nr; *stripe_len = io_stripe_size; out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -2262,49 +2302,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct btrfs_block_group *bg; u64 start = 0; int ret = 0; while (1) { - read_lock(&map_tree->lock); + struct btrfs_chunk_map *map; + struct btrfs_block_group *bg; + /* - * lookup_extent_mapping will return the first extent map - * intersecting the range, so setting @len to 1 is enough to + * btrfs_find_chunk_map() will return the first chunk map + * intersecting the range, so setting @length to 1 is enough to * get the first chunk. */ - em = lookup_extent_mapping(map_tree, start, 1); - read_unlock(&map_tree->lock); - if (!em) + map = btrfs_find_chunk_map(fs_info, start, 1); + if (!map) break; - bg = btrfs_lookup_block_group(fs_info, em->start); + bg = btrfs_lookup_block_group(fs_info, map->start); if (!bg) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", - em->start, em->len); + map->start, map->chunk_len); ret = -EUCLEAN; - free_extent_map(em); + btrfs_free_chunk_map(map); break; } - if (bg->start != em->start || bg->length != em->len || + if (bg->start != map->start || bg->length != map->chunk_len || (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != - (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", - em->start, em->len, - em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + map->start, map->chunk_len, + map->type & BTRFS_BLOCK_GROUP_TYPE_MASK, bg->start, bg->length, bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ret = -EUCLEAN; - free_extent_map(em); + btrfs_free_chunk_map(map); btrfs_put_block_group(bg); break; } - start = em->start + em->len; - free_extent_map(em); + start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); btrfs_put_block_group(bg); } return ret; @@ -2432,28 +2470,25 @@ error: static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct rb_node *node; int ret = 0; - for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { - struct extent_map *em; - struct map_lookup *map; + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { + struct btrfs_chunk_map *map; struct btrfs_block_group *bg; - em = rb_entry(node, struct extent_map, rb_node); - map = em->map_lookup; - bg = btrfs_create_block_group_cache(fs_info, em->start); + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + bg = btrfs_create_block_group_cache(fs_info, map->start); if (!bg) { ret = -ENOMEM; break; } /* Fill dummy cache as FULL */ - bg->length = em->len; + bg->length = map->chunk_len; bg->flags = map->type; bg->cached = BTRFS_CACHE_FINISHED; - bg->used = em->len; + bg->used = map->chunk_len; bg->flags = map->type; ret = btrfs_add_block_group_cache(fs_info, bg); /* @@ -2632,8 +2667,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, } static int insert_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 chunk_offset, - u64 start, 
u64 num_bytes) + const struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -2681,19 +2716,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 dev_offset; - u64 stripe_size; int i; int ret = 0; - em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); - if (IS_ERR(em)) - return PTR_ERR(em); - - map = em->map_lookup; - stripe_size = em->orig_block_len; + map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(map)) + return PTR_ERR(map); /* * Take the device list mutex to prevent races with the final phase of @@ -2710,13 +2740,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, dev_offset = map->stripes[i].physical; ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, - stripe_size); + map->stripe_size); if (ret) break; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -2772,9 +2802,43 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + spin_unlock(&fs_info->unused_bgs_lock); + + /* + * If the block group is still unused, add it to the list of + * unused block groups. The block group may have been created in + * order to satisfy a space reservation, in which case the + * extent allocation only happens later. But often we don't + * actually need to allocate space that we previously reserved, + * so the block group may become unused for a long time. For + * example for metadata we generally reserve space for a worst + * possible scenario, but then don't end up allocating all that + * space or none at all (due to no need to COW, extent buffers + * were already COWed in the current transaction and still + * unwritten, tree heights lower than the maximum possible + * height, etc). For data we generally reserve the exact amount + * of space we are going to allocate later, the exception is + * when using compression, as we must reserve space based on the + * uncompressed data size, because the compression is only done + * when writeback is triggered and we don't know how much space we + * are actually going to need, so we reserve the uncompressed + * size because the data may be incompressible in the worst case. + */ + if (ret == 0) { + bool used; + + spin_lock(&block_group->lock); + used = btrfs_is_block_group_used(block_group); + spin_unlock(&block_group->lock); + + if (!used) + btrfs_mark_bg_unused(block_group); + } } btrfs_trans_release_chunk_metadata(trans); } @@ -2783,7 +2847,7 @@ next: * For extent tree v2 we use the block_group_item->chunk_offset to point at our * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
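The v2 striping mentioned in this comment can be illustrated with a small model. Assuming the function (whose body is mostly outside this hunk) bins the chunk offset by a 1 GiB divisor and takes the remainder modulo the number of global roots, a userspace sketch looks like this (the small-filesystem 256 MiB divisor case is omitted, and names are invented):

#include <stdint.h>
#include <stdio.h>

/* Offsets are binned by 'div' and spread across nr_global_roots. */
static uint64_t global_root_id(uint64_t offset, uint64_t nr_global_roots)
{
	const uint64_t div = 1ULL << 30; /* SZ_1G */

	return (offset / div) % nr_global_roots;
}

int main(void)
{
	/* Two chunks 1 GiB apart land in different global roots. */
	printf("%llu %llu\n",
	       (unsigned long long)global_root_id(5ULL << 30, 4),
	       (unsigned long long)global_root_id(6ULL << 30, 4));
	return 0;
}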
*/ -static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset) { u64 div = SZ_1G; u64 index; @@ -2882,8 +2946,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran #endif list_add_tail(&cache->bg_list, &trans->new_bgs); - trans->delayed_ref_updates++; - btrfs_update_delayed_refs_rsv(trans); + btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); return cache; @@ -2974,7 +3037,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, goto unlock_out; /* - * Skip chunk alloction if the bg is SYSTEM, this is to avoid system + * Skip chunk allocation if the bg is SYSTEM, this is to avoid system * chunk allocation storm to exhaust the system chunk array. Otherwise * we still want to try our best to mark the block group read-only. */ @@ -3116,7 +3179,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; @@ -3168,7 +3230,7 @@ again: * time. */ BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { /* * So theoretically we could recover from this, simply set the @@ -3435,7 +3497,7 @@ again: if (should_put) btrfs_put_block_group(cache); if (drop_reserve) - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); /* * Avoid blocking other tasks for too long. It might even save * us from writing caches for block groups that are going to be @@ -3539,8 +3601,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); if (!ret) - ret = btrfs_run_delayed_refs(trans, - (unsigned long) -1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; @@ -3583,7 +3644,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) /* If its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -3608,12 +3669,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_block_group *cache = NULL; - u64 total = num_bytes; + struct btrfs_space_info *space_info; + struct btrfs_block_group *cache; u64 old_val; - u64 byte_in_group; + bool reclaim = false; + bool bg_already_dirty = true; int factor; - int ret = 0; /* Block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -3625,97 +3686,91 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_root_lock); - while (total) { - struct btrfs_space_info *space_info; - bool reclaim = false; - - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { - ret = -ENOENT; - break; - } - space_info = cache->space_info; - factor = btrfs_bg_type_to_factor(cache->flags); + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -ENOENT; - /* 
- * If this block group has free space cache written out, we - * need to make sure to load it if we are removing space. This - * is because we need the unpinning stage to actually add the - * space back to the block group, otherwise we will leak space. - */ - if (!alloc && !btrfs_block_group_done(cache)) - btrfs_cache_block_group(cache, true); + /* An extent can not span multiple block groups. */ + ASSERT(bytenr + num_bytes <= cache->start + cache->length); - byte_in_group = bytenr - cache->start; - WARN_ON(byte_in_group > cache->length); + space_info = cache->space_info; + factor = btrfs_bg_type_to_factor(cache->flags); - spin_lock(&space_info->lock); - spin_lock(&cache->lock); + /* + * If this block group has free space cache written out, we need to make + * sure to load it if we are removing space. This is because we need + * the unpinning stage to actually add the space back to the block group, + * otherwise we will leak space. + */ + if (!alloc && !btrfs_block_group_done(cache)) + btrfs_cache_block_group(cache, true); - if (btrfs_test_opt(info, SPACE_CACHE) && - cache->disk_cache_state < BTRFS_DC_CLEAR) - cache->disk_cache_state = BTRFS_DC_CLEAR; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); - old_val = cache->used; - num_bytes = min(total, cache->length - byte_in_group); - if (alloc) { - old_val += num_bytes; - cache->used = old_val; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->bytes_used += num_bytes; - space_info->disk_used += num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - old_val -= num_bytes; - cache->used = old_val; - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, - num_bytes); - space_info->bytes_used -= num_bytes; - space_info->disk_used -= num_bytes * factor; + if (btrfs_test_opt(info, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + old_val = cache->used; + if (alloc) { + old_val += num_bytes; + cache->used = old_val; + cache->reserved -= num_bytes; + cache->reclaim_mark = 0; + space_info->bytes_reserved -= num_bytes; + space_info->bytes_used += num_bytes; + space_info->disk_used += num_bytes * factor; + if (READ_ONCE(space_info->periodic_reclaim)) + btrfs_space_info_update_reclaimable(space_info, -num_bytes); + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + } else { + old_val -= num_bytes; + cache->used = old_val; + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + space_info->bytes_used -= num_bytes; + space_info->disk_used -= num_bytes * factor; + if (READ_ONCE(space_info->periodic_reclaim)) + btrfs_space_info_update_reclaimable(space_info, num_bytes); + else reclaim = should_reclaim_block_group(cache, num_bytes); - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - - set_extent_bit(&trans->transaction->pinned_extents, - bytenr, bytenr + num_bytes - 1, - EXTENT_DIRTY, NULL); - } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - trans->delayed_ref_updates++; - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); + set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + } - /* - * No longer have used bytes in this block group, queue it for - * 
deletion. We do this after adding the block group to the - * dirty list to avoid races between cleaner kthread and space - * cache writeout. - */ - if (!alloc && old_val == 0) { - if (!btrfs_test_opt(info, DISCARD_ASYNC)) - btrfs_mark_bg_unused(cache); - } else if (!alloc && reclaim) { - btrfs_mark_bg_to_reclaim(cache); - } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_put_block_group(cache); - total -= num_bytes; - bytenr += num_bytes; + /* + * No longer have used bytes in this block group, queue it for deletion. + * We do this after adding the block group to the dirty list to avoid + * races between cleaner kthread and space cache writeout. + */ + if (!alloc && old_val == 0) { + if (!btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); } + btrfs_put_block_group(cache); + /* Modified block groups are accounted for in the delayed_refs_rsv. */ - btrfs_update_delayed_refs_rsv(trans); - return ret; + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(info); + + return 0; } /* @@ -3819,8 +3874,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, int force) +static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; @@ -4195,7 +4250,7 @@ out: return ret; } -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; @@ -4336,13 +4391,13 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) spin_lock(&block_group->lock); if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { - struct inode *inode = block_group->inode; + struct btrfs_inode *inode = block_group->inode; block_group->inode = NULL; spin_unlock(&block_group->lock); ASSERT(block_group->io_ctl.inode == NULL); - iput(inode); + iput(&inode->vfs_inode); } else { spin_unlock(&block_group->lock); } @@ -4487,8 +4542,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache) void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct extent_map_tree *em_tree; - struct extent_map *em; bool cleanup; spin_lock(&block_group->lock); @@ -4497,17 +4550,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) spin_unlock(&block_group->lock); if (cleanup) { - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, block_group->start, - 1); - BUG_ON(!em); /* logic error, can't happen */ - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for us and once for the tree */ - free_extent_map(em); - free_extent_map(em); + struct btrfs_chunk_map *map; + + map = btrfs_find_chunk_map(fs_info, block_group->start, 1); + /* Logic error, can't happen. */ + ASSERT(map); + + btrfs_remove_chunk_map(fs_info, map); + + /* Once for our lookup reference. 
*/ + btrfs_free_chunk_map(map); /* * We may have left one free space entry and other possible @@ -4602,7 +4654,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, return 0; } -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg) { if (btrfs_is_zoned(bg->fs_info)) return false; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 089979981e4a..36937eeab9b8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -3,8 +3,23 @@ #ifndef BTRFS_BLOCK_GROUP_H #define BTRFS_BLOCK_GROUP_H +#include <linux/atomic.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/wait.h> +#include <linux/sizes.h> +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs_tree.h> #include "free-space-cache.h" +struct btrfs_chunk_map; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_trans_handle; + enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, BTRFS_DC_ERROR, @@ -100,7 +115,7 @@ struct btrfs_caching_control { struct btrfs_block_group { struct btrfs_fs_info *fs_info; - struct inode *inode; + struct btrfs_inode *inode; spinlock_t lock; u64 start; u64 length; @@ -243,14 +258,15 @@ struct btrfs_block_group { u64 zone_unusable; u64 zone_capacity; u64 meta_write_pointer; - struct map_lookup *physical_map; + struct btrfs_chunk_map *physical_map; struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; enum btrfs_block_group_size_class size_class; + u64 reclaim_mark; }; -static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } @@ -262,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); } -static inline bool btrfs_is_block_group_data_only( - struct btrfs_block_group *block_group) +static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) { /* * In mixed mode the fragmentation is expected to be high, lowering the @@ -274,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( @@ -295,7 +310,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); -void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); int btrfs_add_new_free_space(struct btrfs_block_group *block_group, @@ -304,7 +318,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em); + struct btrfs_chunk_map *map); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); void btrfs_reclaim_bgs_work(struct work_struct 
*work); @@ -355,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static inline int btrfs_block_group_done(struct btrfs_block_group *cache) +static inline int btrfs_block_group_done(const struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || @@ -372,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index db8da4e7b228..a07b9594dc70 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,7 +6,6 @@ #include "space-info.h" #include "transaction.h" #include "block-group.h" -#include "disk-io.h" #include "fs.h" #include "accessors.h" @@ -221,7 +220,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -261,7 +261,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -279,10 +280,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *target = NULL; /* - * If we are the delayed_rsv then push to the global rsv, otherwise dump - * into the delayed rsv if it is not full. + * If we are a delayed block reserve then push to the global rsv, + * otherwise dump into the global delayed reserve if it is not full. 
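The spill scheme this comment describes is easy to model. A userspace sketch of the target selection, with invented enum names standing in for the kernel's block reserve types:

#include <stdbool.h>
#include <stdio.h>

enum rsv { RSV_NONE, RSV_GLOBAL, RSV_DELOPS, RSV_DELREFS, RSV_OTHER };

/* Pick the reserve that released bytes should spill into, if any. */
static enum rsv release_target(enum rsv type, bool delayed_refs_full)
{
	if (type == RSV_DELOPS)
		return RSV_GLOBAL;   /* delayed ops top up the global rsv */
	if (type != RSV_GLOBAL && !delayed_refs_full)
		return RSV_DELREFS;  /* everyone else refills delayed refs */
	return RSV_NONE;             /* nothing to top up, just free it */
}

int main(void)
{
	printf("%d %d %d\n",
	       release_target(RSV_DELOPS, false),  /* -> RSV_GLOBAL (1) */
	       release_target(RSV_OTHER, false),   /* -> RSV_DELREFS (3) */
	       release_target(RSV_OTHER, true));   /* -> RSV_NONE (0) */
	return 0;
}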
*/ - if (block_rsv == delayed_rsv) + if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS) target = global_rsv; else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv)) target = delayed_rsv; @@ -340,9 +341,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) read_lock(&fs_info->global_root_lock); rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, rb_node) { - if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID || - root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || - root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) { num_bytes += btrfs_root_used(&root->root_item); min_items++; } @@ -354,6 +355,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) min_items++; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item); + min_items++; + } + /* * But we also want to reserve enough space so we can do the fallback * global reserve for an unlink, which is an additional @@ -400,11 +406,12 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - switch (root->root_key.objectid) { + switch (btrfs_root_id(root)) { case BTRFS_CSUM_TREE_OBJECTID: case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + case BTRFS_RAID_STRIPE_TREE_OBJECTID: root->block_rsv = &fs_info->delayed_refs_rsv; break; case BTRFS_ROOT_TREE_OBJECTID: @@ -461,8 +468,7 @@ static struct btrfs_block_rsv *get_block_rsv( if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || (root == fs_info->uuid_root) || - (trans->adding_csums && - root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID)) + (trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -517,8 +523,8 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; /* @@ -539,7 +545,7 @@ try_reserve: * one last time to force a reservation if there's enough actual space * on disk to make the reservation. 
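The retry pattern around this comment (try cheaply first, then force the reservation) can be sketched as follows. This is a toy with an invented safety margin, and the real code walks several intermediate flush levels between the two shown here:

#include <stdbool.h>
#include <stdio.h>

enum flush { NO_FLUSH, FLUSH_EMERGENCY };

/* Pretend reservation: succeeds only if enough space for the level. */
static bool reserve(unsigned long long *free, unsigned long long bytes,
		    enum flush level)
{
	/* NO_FLUSH keeps a safety margin; EMERGENCY uses every last byte. */
	unsigned long long margin = (level == NO_FLUSH) ? 16ULL << 20 : 0;

	if (*free < bytes + margin)
		return false;
	*free -= bytes;
	return true;
}

int main(void)
{
	unsigned long long free = 20ULL << 20; /* 20 MiB actually free */
	unsigned long long need = 8ULL << 20;  /* one tree block batch */

	if (!reserve(&free, need, NO_FLUSH) &&
	    !reserve(&free, need, FLUSH_EMERGENCY))
		puts("ENOSPC");
	else
		printf("reserved, %llu MiB left\n", free >> 20);
	return 0;
}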
*/ - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize, BTRFS_RESERVE_FLUSH_EMERGENCY); if (!ret) return block_rsv; @@ -547,7 +553,7 @@ try_reserve: return ERR_PTR(ret); } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { u64 needed_bytes; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 43a9a6b5a79f..d12b1fac5c74 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -3,8 +3,15 @@ #ifndef BTRFS_BLOCK_RSV_H #define BTRFS_BLOCK_RSV_H +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/spinlock.h> + struct btrfs_trans_handle; struct btrfs_root; +struct btrfs_space_info; +struct btrfs_block_rsv; +struct btrfs_fs_info; enum btrfs_reserve_flush_enum; /* @@ -82,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ec6679a538c1..db53a3263fbd 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -8,11 +8,31 @@ #include <linux/hash.h> #include <linux/refcount.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/compiler.h> +#include <linux/fscrypt.h> +#include <linux/lockdep.h> +#include <uapi/linux/btrfs_tree.h> +#include <trace/events/btrfs.h> +#include "block-rsv.h" #include "extent_map.h" #include "extent_io.h" +#include "extent-io-tree.h" #include "ordered-data.h" #include "delayed-inode.h" +struct extent_state; +struct posix_acl; +struct iov_iter; +struct writeback_control; +struct btrfs_root; +struct btrfs_fs_info; +struct btrfs_trans_handle; + /* * Since we search a directory based on f_pos (struct dir_context::pos) we have * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so @@ -39,7 +59,6 @@ enum { */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, - BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, /* @@ -67,6 +86,41 @@ enum { BTRFS_INODE_VERITY_IN_PROGRESS, /* Set when this inode is a free space inode. */ BTRFS_INODE_FREE_SPACE_INODE, + /* Set when there are no capabilities in xattrs for the inode. */ + BTRFS_INODE_NO_CAP_XATTR, + /* + * Set if an error happened when doing a COW write before submitting a + * bio or during writeback. Used for both buffered writes and direct IO + * writes. This is to signal a fast fsync that it has to wait for + * ordered extents to complete and therefore not log extent maps that + * point to unwritten extents (when an ordered extent completes and it + * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its + * range). + */ + BTRFS_INODE_COW_WRITE_ERROR, + /* + * Indicate this is a directory that points to a subvolume for which + * there is no root reference item.
That's a case like the following: + * + * $ btrfs subvolume create /mnt/parent + * $ btrfs subvolume create /mnt/parent/child + * $ btrfs subvolume snapshot /mnt/parent /mnt/snap + * + * If subvolume "parent" is root 256, subvolume "child" is root 257 and + * snapshot "snap" is root 258, then there's no root reference item (key + * BTRFS_ROOT_REF_KEY in the root tree) for the subvolume "child" + * associated to root 258 (the snapshot) - there's one only for the root + * of the "parent" subvolume (root 256). In the root tree we have a + * (256 BTRFS_ROOT_REF_KEY 257) key but we don't have a + * (258 BTRFS_ROOT_REF_KEY 257) key - the same goes for backrefs, we + * have a (257 BTRFS_ROOT_BACKREF_KEY 256) but we don't have a + * (257 BTRFS_ROOT_BACKREF_KEY 258) key. + * + * So when opening the "child" dentry from the snapshot's directory, + * we don't find a root ref item and we create a stub inode. This is + * done at new_simple_dir(), called from btrfs_lookup_dentry(). + */ + BTRFS_INODE_ROOT_STUB, }; /* in memory btrfs inode */ @@ -74,10 +128,23 @@ struct btrfs_inode { /* which subvolume this inode belongs to */ struct btrfs_root *root; - /* key used to find this inode on disk. This is used by the code - * to read in roots of subvolumes +#if BITS_PER_LONG == 32 + /* + * The objectid of the corresponding BTRFS_INODE_ITEM_KEY. + * On 64-bit platforms we can get it from vfs_inode.i_ino, which is an + * unsigned long and therefore 64 bits on such platforms. + */ + u64 objectid; +#endif + + /* Cached value of inode property 'compression'. */ + u8 prop_compress; + + /* + * Force compression on the file using the defrag ioctl, could be + * different from prop_compress and takes precedence if set. */ - struct btrfs_key location; + u8 defrag_compress; /* * Lock for counters and all fields used to determine if the inode is in @@ -97,9 +164,11 @@ struct btrfs_inode { /* * Keep track of where the inode has extent items mapped in order to - * make sure the i_size adjustments are accurate + * make sure the i_size adjustments are accurate. Not required when the + * filesystem is NO_HOLES, the status can't be set while mounted as + * it's a mkfs-time feature. */ - struct extent_io_tree file_extent_tree; + struct extent_io_tree *file_extent_tree; /* held while logging the inode in tree-log.c */ struct mutex log_mutex; @@ -113,7 +182,9 @@ struct btrfs_inode { unsigned outstanding_extents; /* used to order data wrt metadata */ - struct btrfs_ordered_inode_tree ordered_tree; + spinlock_t ordered_tree_lock; + struct rb_root ordered_tree; + struct rb_node *ordered_tree_last; /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used */ struct list_head delalloc_inodes; - /* node for the red-black tree that links inodes in subvolume root */ - struct rb_node rb_node; - unsigned long runtime_flags; /* full 64 bit generation number, struct vfs_inode doesn't have a big @@ -184,11 +252,20 @@ struct btrfs_inode { u64 last_dir_index_offset; }; - /* - * Total number of bytes pending defrag, used by stat to check whether - * it needs COW. Protected by 'lock'. - */ - u64 defrag_bytes; + union { + /* + * Total number of bytes pending defrag, used by stat to check whether + * it needs COW. Protected by 'lock'. + * Used by inodes other than the data relocation inode. + */ + u64 defrag_bytes; + + /* + * Logical address of the block group being relocated. + * Used only by the data relocation inode.
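These unions overlay fields that are never live on the same inode at the same time (a directory has no csum accounting, only the data relocation inode tracks a relocation start, and so on). A toy model of the idea, with invented names:

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of the overlaying above: a directory never tracks csum
 * bytes and a regular file never tracks an index counter, so the two
 * u64s can share storage. Names are illustrative, not the kernel's.
 */
struct toy_inode {
	unsigned is_dir : 1;
	union {
		uint64_t index_cnt;  /* directories only */
		uint64_t csum_bytes; /* regular files only */
	};
};

int main(void)
{
	struct toy_inode dir = { .is_dir = 1, .index_cnt = 2 };
	struct toy_inode file = { .is_dir = 0, .csum_bytes = 4096 };

	/* One u64 of storage serves both roles. */
	printf("sizeof(struct toy_inode) = %zu\n", sizeof(struct toy_inode));
	printf("%llu %llu\n",
	       (unsigned long long)dir.index_cnt,
	       (unsigned long long)file.csum_bytes);
	return 0;
}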
+ */ + u64 reloc_block_group_start; + }; /* * The size of the file stored in the metadata on disk. data=ordered */ u64 disk_i_size; - /* - * If this is a directory then index_cnt is the counter for the index - * number for new files that are created. For an empty directory, this - * must be initialized to BTRFS_DIR_START_INDEX. - */ - u64 index_cnt; + union { + /* + * If this is a directory then index_cnt is the counter for the + * index number for new files that are created. For an empty + * directory, this must be initialized to BTRFS_DIR_START_INDEX. + */ + u64 index_cnt; + + /* + * If this is not a directory, this is the number of bytes + * outstanding that are going to need csums. This is used in + * ENOSPC accounting. Protected by 'lock'. + */ + u64 csum_bytes; + }; /* Cache the directory index number to speed the dir/file remove */ u64 dir_index; @@ -214,22 +300,25 @@ struct btrfs_inode { */ u64 last_unlink_trans; - /* - * The id/generation of the last transaction where this inode was - * either the source or the destination of a clone/dedupe operation. - * Used when logging an inode to know if there are shared extents that - * need special care when logging checksum items, to avoid duplicate - * checksum items in a log (which can lead to a corruption where we end - * up with missing checksum ranges after log replay). - * Protected by the vfs inode lock. - */ - u64 last_reflink_trans; + union { + /* + * The id/generation of the last transaction where this inode + * was either the source or the destination of a clone/dedupe + * operation. Used when logging an inode to know if there are + * shared extents that need special care when logging checksum + * items, to avoid duplicate checksum items in a log (which can + * lead to a corruption where we end up with missing checksum + * ranges after log replay). Protected by the VFS inode lock. + * Used for regular files only. + */ + u64 last_reflink_trans; - /* - * Number of bytes outstanding that are going to need csums. This is - * used in ENOSPC accounting. Protected by 'lock'. - */ - u64 csum_bytes; + /* + * In case this is a root stub inode (BTRFS_INODE_ROOT_STUB flag set), + * the ID of that root. + */ + u64 ref_root_id; + }; /* Backwards incompatible flags, lower half of inode_item::flags */ u32 flags; @@ -238,20 +327,11 @@ struct btrfs_inode { struct btrfs_block_rsv block_rsv; - /* - * Cached values of inode properties - */ - unsigned prop_compress; /* per-file compression algorithm */ - /* - * Force compression on the file using the defrag ioctl, could be - * different from prop_compress and takes precedence if set - */ - unsigned defrag_compress; - struct btrfs_delayed_node *delayed_node; /* File creation time. */ - struct timespec64 i_otime; + u64 i_otime_sec; + u32 i_otime_nsec; /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; @@ -271,10 +351,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, WRITE_ONCE(inode->first_dir_index_to_log, index); } -static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) -{ - return container_of(inode, struct btrfs_inode, vfs_inode); -} +/* Type checked and const-preserving VFS inode -> btrfs inode.
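The _Generic-based definition that follows is worth a standalone demonstration: a plain container_of() would silently drop the const qualifier, while dispatching on the pointer type preserves it. A self-contained sketch with a toy pair of structs (names invented, container_of() simplified from the kernel's):

#include <stddef.h>
#include <stdio.h>

struct inner { int x; };
struct outer { long pad; struct inner in; };

/* Simplified container_of(): no typeof-based type checking here. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Const-preserving: a const inner pointer yields a const outer pointer. */
#define OUTER(_p)							\
	_Generic(_p,							\
		struct inner *: container_of(_p, struct outer, in),	\
		const struct inner *: (const struct outer *)container_of( \
			_p, struct outer, in))

int main(void)
{
	struct outer o = { .pad = 1, .in = { .x = 42 } };
	const struct inner *cip = &o.in;
	struct inner *ip = &o.in;

	const struct outer *co = OUTER(cip); /* stays const-qualified */
	struct outer *mo = OUTER(ip);        /* mutable in, mutable out */

	printf("%d %ld\n", co->in.x, mo->pad);
	return 0;
}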
*/ +#define BTRFS_I(_inode) \ + _Generic(_inode, \ + struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \ + const struct inode *: (const struct btrfs_inode *)container_of( \ + _inode, const struct btrfs_inode, vfs_inode)) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) @@ -296,10 +378,9 @@ static inline unsigned long btrfs_inode_hash(u64 objectid, */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { - u64 ino = inode->location.objectid; + u64 ino = inode->objectid; - /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ - if (inode->location.type == BTRFS_ROOT_ITEM_KEY) + if (test_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags)) ino = inode->vfs_inode.i_ino; return ino; } @@ -313,20 +394,36 @@ static inline u64 btrfs_ino(const struct btrfs_inode *inode) #endif +static inline void btrfs_get_inode_key(const struct btrfs_inode *inode, + struct btrfs_key *key) +{ + key->objectid = btrfs_ino(inode); + key->type = BTRFS_INODE_ITEM_KEY; + key->offset = 0; +} + +static inline void btrfs_set_inode_number(struct btrfs_inode *inode, u64 ino) +{ +#if BITS_PER_LONG == 32 + inode->objectid = ino; +#endif + inode->vfs_inode.i_ino = ino; +} + static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); inode->disk_i_size = size; } -static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) +static inline bool btrfs_is_free_space_inode(const struct btrfs_inode *inode) { return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); } -static inline bool is_data_inode(struct inode *inode) +static inline bool is_data_inode(const struct btrfs_inode *inode) { - return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; + return btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID; } static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, @@ -356,9 +453,11 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) } /* - * Should be called while holding the inode's VFS lock in exclusive mode or in a - * context where no one else can access the inode concurrently (during inode - * creation or when loading an inode from disk). + * Should be called while holding the inode's VFS lock in exclusive mode, or + * while holding the inode's mmap lock (struct btrfs_inode::i_mmap_lock) in + * either shared or exclusive mode, or in a context where no one else can access + * the inode concurrently (during inode creation or when loading an inode from + * disk). */ static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) { @@ -392,7 +491,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && - inode->last_sub_trans <= inode->root->last_log_commit) + inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root)) ret = true; spin_unlock(&inode->lock); return ret; @@ -409,6 +508,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } +static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) +{ + /* Immediately trigger a crash if the inode is not locked. */ + ASSERT(inode_is_locked(&inode->vfs_inode)); + /* Trigger a splat in dmesg if this task is not holding the lock. 
*/ + lockdep_assert_held(&inode->vfs_inode.i_rwsem); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes @@ -418,10 +525,10 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict); + struct btrfs_file_extent *file_extent, + bool nowait, bool strict); -void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -471,7 +578,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); @@ -479,16 +585,15 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path); -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); +struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path); +struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end); + struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); + struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); + struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); @@ -502,24 +607,27 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); +int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, + u64 disk_bytenr, u64 disk_io_size, struct page **pages); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - 
struct btrfs_ioctl_encoded_io_args *encoded); + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size); +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); -struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); +struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino); extern const struct dentry_operations btrfs_dentry_operations; @@ -535,5 +643,10 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); +u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + u64 num_bytes); +struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, + const struct btrfs_file_extent *file_extent, + int type); #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c deleted file mode 100644 index 3caf339c4bb3..000000000000 --- a/fs/btrfs/check-integrity.c +++ /dev/null @@ -1,2871 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - */ - -/* - * This module can be used to catch cases when the btrfs kernel - * code executes write requests to the disk that bring the file - * system in an inconsistent state. In such a state, a power-loss - * or kernel panic event would cause that the data on disk is - * lost or at least damaged. - * - * Code is added that examines all block write requests during - * runtime (including writes of the super block). Three rules - * are verified and an error is printed on violation of the - * rules: - * 1. It is not allowed to write a disk block which is - * currently referenced by the super block (either directly - * or indirectly). - * 2. When a super block is written, it is verified that all - * referenced (directly or indirectly) blocks fulfill the - * following requirements: - * 2a. All referenced blocks have either been present when - * the file system was mounted, (i.e., they have been - * referenced by the super block) or they have been - * written since then and the write completion callback - * was called and no write error was indicated and a - * FLUSH request to the device where these blocks are - * located was received and completed. - * 2b. All referenced blocks need to have a generation - * number which is equal to the parent's number. - * - * One issue that was found using this module was that the log - * tree on disk became temporarily corrupted because disk blocks - * that had been in use for the log tree had been freed and - * reused too early, while being referenced by the written super - * block. - * - * The search term in the kernel log that can be used to filter - * on the existence of detected integrity issues is - * "btrfs: attempt". - * - * The integrity check is enabled via mount options. These - * mount options are only supported if the integrity check - * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. 
- *
- * Example #1, apply integrity checks to all metadata:
- *       mount /dev/sdb1 /mnt -o check_int
- *
- * Example #2, apply integrity checks to all metadata and
- * to data extents:
- *       mount /dev/sdb1 /mnt -o check_int_data
- *
- * Example #3, apply integrity checks to all metadata and dump
- * the tree that the super block references to kernel messages
- * each time after a super block was written:
- *       mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
- *
- * If the integrity check tool is included and activated in
- * the mount options, plenty of kernel memory is used, and
- * plenty of additional CPU cycles are spent. Enabling this
- * functionality is not intended for normal use. In most
- * cases, unless you are a btrfs developer who needs to verify
- * the integrity of (super)-block write requests, do not
- * enable the config option BTRFS_FS_CHECK_INTEGRITY to
- * include and compile the integrity check tool.
- *
- * Expect millions of lines of information in the kernel log with an
- * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
- * kernel config to at least 26 (which is 64MB). Usually the value is
- * limited to 21 (which is 2MB) in init/Kconfig. init/Kconfig needs to
- * be changed like this before LOG_BUF_SHIFT can be set to a high value:
- *       config LOG_BUF_SHIFT
- *               int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
- *               range 12 30
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/blkdev.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <crypto/hash.h>
-#include "messages.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "extent_io.h"
-#include "volumes.h"
-#include "print-tree.h"
-#include "locking.h"
-#include "check-integrity.h"
-#include "rcu-string.h"
-#include "compression.h"
-#include "accessors.h"
-
-#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
-#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
-#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
-#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
-#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
-#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)	/* in characters,
-							 * excluding " [...]" */
-#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
-
-/*
- * The bitmask fields for the print_mask are defined below. The mask
- * is specified with the mount option check_int_print_mask.
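- *
- * As a worked example, the value 263 used in example #3 above is
- * 0x107, i.e. the bitwise OR of, in terms of the bits defined below,
- * BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE (0x001),
- * BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION (0x002),
- * BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE (0x004) and
- * BTRFSIC_PRINT_MASK_INITIAL_TREE (0x100).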
- */
-#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
-#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
-#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
-#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
-#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
-#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
-#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
-#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
-#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
-#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
-#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
-#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
-
-struct btrfsic_dev_state;
-struct btrfsic_state;
-
-struct btrfsic_block {
-	u32 magic_num;		/* only used for debug purposes */
-	unsigned int is_metadata:1;	/* if it is meta-data, not file data */
-	unsigned int is_superblock:1;	/* if it is one of the superblocks */
-	unsigned int is_iodone:1;	/* if IO is done by the lower subsystem */
-	unsigned int iodone_w_error:1;	/* error was indicated to endio */
-	unsigned int never_written:1;	/* block was added because it was
-					 * referenced, not because it was
-					 * written */
-	unsigned int mirror_num;	/* large enough to hold
-					 * BTRFS_SUPER_MIRROR_MAX */
-	struct btrfsic_dev_state *dev_state;
-	u64 dev_bytenr;		/* key, physical byte num on disk */
-	u64 logical_bytenr;	/* logical byte num on disk */
-	u64 generation;
-	struct btrfs_disk_key disk_key;	/* extra info to print in case of
-					 * issues, will not always be correct */
-	struct list_head collision_resolving_node;	/* list node */
-	struct list_head all_blocks_node;	/* list node */
-
-	/* the following two lists contain block_link items */
-	struct list_head ref_to_list;	/* list */
-	struct list_head ref_from_list;	/* list */
-	struct btrfsic_block *next_in_same_bio;
-	void *orig_bio_private;
-	bio_end_io_t *orig_bio_end_io;
-	blk_opf_t submit_bio_bh_rw;
-	u64 flush_gen; /* only valid if !never_written */
-};
-
-/*
- * Elements of this type are allocated dynamically and are required because
- * each block object can refer to, and can be referenced from, multiple
- * other blocks. The key used to look them up in the hashtable is the
- * dev_bytenr of the block referred to combined with the dev_bytenr of
- * the referring block. The fact that they are searchable via a hashtable
- * and that a ref_cnt is maintained is not required for the btrfs integrity
- * check algorithm itself; it is only used to make the output more
- * readable in case an error is detected (an error is defined
- * as a write operation to a block while that block is still referenced).
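- *
- * (Concretely, btrfsic_block_link_hashtable_add() below computes the
- * bucket by XOR-ing both dev_bytenr values, each shifted right by 16,
- * with both block_device pointers, and masking the result with
- * BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1.)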
- */ -struct btrfsic_block_link { - u32 magic_num; /* only used for debug purposes */ - u32 ref_cnt; - struct list_head node_ref_to; /* list node */ - struct list_head node_ref_from; /* list node */ - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block *block_ref_to; - struct btrfsic_block *block_ref_from; - u64 parent_generation; -}; - -struct btrfsic_dev_state { - u32 magic_num; /* only used for debug purposes */ - struct block_device *bdev; - struct btrfsic_state *state; - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block dummy_block_for_bio_bh_flush; - u64 last_flush_gen; -}; - -struct btrfsic_block_hashtable { - struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_link_hashtable { - struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; -}; - -struct btrfsic_dev_state_hashtable { - struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_data_ctx { - u64 start; /* virtual bytenr */ - u64 dev_bytenr; /* physical bytenr on device */ - u32 len; - struct btrfsic_dev_state *dev; - char **datav; - struct page **pagev; - void *mem_to_free; -}; - -/* This structure is used to implement recursion without occupying - * any stack space, refer to btrfsic_process_metablock() */ -struct btrfsic_stack_frame { - u32 magic; - u32 nr; - int error; - int i; - int limit_nesting; - int num_copies; - int mirror_num; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx *block_ctx; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfs_header *hdr; - struct btrfsic_stack_frame *prev; -}; - -/* Some state per mounted filesystem */ -struct btrfsic_state { - u32 print_mask; - int include_extent_data; - struct list_head all_blocks_list; - struct btrfsic_block_hashtable block_hashtable; - struct btrfsic_block_link_hashtable block_link_hashtable; - struct btrfs_fs_info *fs_info; - u64 max_superblock_generation; - struct btrfsic_block *latest_superblock; - u32 metablock_size; - u32 datablock_size; -}; - -static int btrfsic_process_metablock(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - int limit_nesting, int force_iodone_flag); -static void btrfsic_read_from_block_data( - struct btrfsic_block_data_ctx *block_ctx, - void *dst, u32 offset, size_t len); -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx - *block_ctx, u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation); -static int btrfsic_handle_extent_data(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag); -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num); -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const block, - struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp); -static int 
btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level); -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level); -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block); -static void btrfsic_dump_tree(const struct btrfsic_state *state); -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level); -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation); -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created); -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev); -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr); - -static struct mutex btrfsic_mutex; -static int btrfsic_is_initialized; -static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; - - -static void btrfsic_block_init(struct btrfsic_block *b) -{ - b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; - b->dev_state = NULL; - b->dev_bytenr = 0; - b->logical_bytenr = 0; - b->generation = BTRFSIC_GENERATION_UNKNOWN; - b->disk_key.objectid = 0; - b->disk_key.type = 0; - b->disk_key.offset = 0; - b->is_metadata = 0; - b->is_superblock = 0; - b->is_iodone = 0; - b->iodone_w_error = 0; - b->never_written = 0; - b->mirror_num = 0; - b->next_in_same_bio = NULL; - b->orig_bio_private = NULL; - b->orig_bio_end_io = NULL; - INIT_LIST_HEAD(&b->collision_resolving_node); - INIT_LIST_HEAD(&b->all_blocks_node); - INIT_LIST_HEAD(&b->ref_to_list); - INIT_LIST_HEAD(&b->ref_from_list); - b->submit_bio_bh_rw = 0; - b->flush_gen = 0; -} - -static struct btrfsic_block *btrfsic_block_alloc(void) -{ - struct btrfsic_block *b; - - b = kzalloc(sizeof(*b), GFP_NOFS); - if (NULL != b) - btrfsic_block_init(b); - - return b; -} - -static void btrfsic_block_free(struct btrfsic_block *b) -{ - BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); - kfree(b); -} - -static void btrfsic_block_link_init(struct btrfsic_block_link *l) -{ - l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; - l->ref_cnt = 1; - INIT_LIST_HEAD(&l->node_ref_to); - INIT_LIST_HEAD(&l->node_ref_from); - INIT_LIST_HEAD(&l->collision_resolving_node); - l->block_ref_to = NULL; - l->block_ref_from = NULL; -} - -static struct btrfsic_block_link *btrfsic_block_link_alloc(void) -{ - struct btrfsic_block_link *l; - - l = kzalloc(sizeof(*l), GFP_NOFS); - if (NULL != l) - btrfsic_block_link_init(l); - - return l; -} - -static void 
btrfsic_block_link_free(struct btrfsic_block_link *l) -{ - BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); - kfree(l); -} - -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) -{ - ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; - ds->bdev = NULL; - ds->state = NULL; - INIT_LIST_HEAD(&ds->collision_resolving_node); - ds->last_flush_gen = 0; - btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); - ds->dummy_block_for_bio_bh_flush.is_iodone = 1; - ds->dummy_block_for_bio_bh_flush.dev_state = ds; -} - -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) -{ - struct btrfsic_dev_state *ds; - - ds = kzalloc(sizeof(*ds), GFP_NOFS); - if (NULL != ds) - btrfsic_dev_state_init(ds); - - return ds; -} - -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) -{ - BUG_ON(!(NULL == ds || - BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); - kfree(ds); -} - -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(b->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)b->dev_state->bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - - list_add(&b->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) -{ - list_del(&b->collision_resolving_node); -} - -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct btrfsic_block *b; - - list_for_each_entry(b, h->table + hashval, collision_resolving_node) { - if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) - return b; - } - - return NULL; -} - -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ - ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ - ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) - & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - list_add(&l->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) -{ - list_del(&l->collision_resolving_node); -} - -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ - ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ - ((unsigned int)((uintptr_t)bdev_ref_to)) ^ - ((unsigned int)((uintptr_t)bdev_ref_from))) & - (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct btrfsic_block_link *l; - - list_for_each_entry(l, h->table + 
hashval, collision_resolving_node) { - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - if (l->block_ref_to->dev_state->bdev == bdev_ref_to && - l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && - l->block_ref_from->dev_state->bdev == bdev_ref_from && - l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) - return l; - } - - return NULL; -} - -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - - list_add(&ds->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) -{ - list_del(&ds->collision_resolving_node); -} - -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1); - struct btrfsic_dev_state *ds; - - list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { - if (ds->bdev->bd_dev == dev) - return ds; - } - - return NULL; -} - -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices) -{ - struct btrfs_super_block *selected_super; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - struct btrfsic_dev_state *selected_dev_state = NULL; - int ret = 0; - int pass; - - selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); - if (!selected_super) - return -ENOMEM; - - list_for_each_entry(device, dev_head, dev_list) { - int i; - struct btrfsic_dev_state *dev_state; - - if (!device->bdev || !device->name) - continue; - - dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev); - BUG_ON(NULL == dev_state); - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - ret = btrfsic_process_superblock_dev_mirror( - state, dev_state, device, i, - &selected_dev_state, selected_super); - if (0 != ret && 0 == i) { - kfree(selected_super); - return ret; - } - } - } - - if (NULL == state->latest_superblock) { - pr_info("btrfsic: no superblock found!\n"); - kfree(selected_super); - return -1; - } - - for (pass = 0; pass < 3; pass++) { - int num_copies; - int mirror_num; - u64 next_bytenr; - - switch (pass) { - case 0: - next_bytenr = btrfs_super_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("root@%llu\n", next_bytenr); - break; - case 1: - next_bytenr = btrfs_super_chunk_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("chunk@%llu\n", next_bytenr); - break; - case 2: - next_bytenr = btrfs_super_log_root(selected_super); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("log@%llu\n", next_bytenr); - break; - } - - num_copies = btrfs_num_copies(state->fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct 
btrfsic_block_link *l; - - ret = btrfsic_map_block(state, next_bytenr, - state->metablock_size, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n", - next_bytenr, mirror_num); - kfree(selected_super); - return -1; - } - - next_block = btrfsic_block_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - &state->block_hashtable); - BUG_ON(NULL == next_block); - - l = btrfsic_block_link_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - state->latest_superblock->dev_state-> - bdev, - state->latest_superblock->dev_bytenr, - &state->block_link_hashtable); - BUG_ON(NULL == l); - - ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)PAGE_SIZE) { - pr_info("btrfsic: read @logical %llu failed!\n", - tmp_next_block_ctx.start); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - kfree(selected_super); - return -1; - } - - ret = btrfsic_process_metablock(state, - next_block, - &tmp_next_block_ctx, - BTRFS_MAX_LEVEL + 3, 1); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - } - } - - kfree(selected_super); - return ret; -} - -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfs_super_block *super_tmp; - u64 dev_bytenr; - struct btrfsic_block *superblock_tmp; - int pass; - struct block_device *const superblock_bdev = device->bdev; - struct page *page; - struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; - int ret = 0; - - /* super block bytenr is always the unmapped device bytenr */ - dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) - return -1; - - page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return -1; - - super_tmp = page_address(page); - - if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - btrfs_super_magic(super_tmp) != BTRFS_MAGIC || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || - btrfs_super_nodesize(super_tmp) != state->metablock_size || - btrfs_super_sectorsize(super_tmp) != state->datablock_size) { - ret = 0; - goto out; - } - - superblock_tmp = - btrfsic_block_hashtable_lookup(superblock_bdev, - dev_bytenr, - &state->block_hashtable); - if (NULL == superblock_tmp) { - superblock_tmp = btrfsic_block_alloc(); - if (NULL == superblock_tmp) { - ret = -1; - goto out; - } - /* for superblock, only the dev_bytenr makes sense */ - superblock_tmp->dev_bytenr = dev_bytenr; - superblock_tmp->dev_state = dev_state; - superblock_tmp->logical_bytenr = dev_bytenr; - superblock_tmp->generation = btrfs_super_generation(super_tmp); - superblock_tmp->is_metadata = 1; - superblock_tmp->is_superblock = 1; - superblock_tmp->is_iodone = 1; - superblock_tmp->never_written = 0; - superblock_tmp->mirror_num = 1 + superblock_mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - btrfs_info_in_rcu(fs_info, - "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", - superblock_bdev, - btrfs_dev_name(device), dev_bytenr, - dev_state->bdev, dev_bytenr, - superblock_mirror_num); - list_add(&superblock_tmp->all_blocks_node, - &state->all_blocks_list); - btrfsic_block_hashtable_add(superblock_tmp, - 
&state->block_hashtable); - } - - /* select the one with the highest generation field */ - if (btrfs_super_generation(super_tmp) > - state->max_superblock_generation || - 0 == state->max_superblock_generation) { - memcpy(selected_super, super_tmp, sizeof(*selected_super)); - *selected_dev_state = dev_state; - state->max_superblock_generation = - btrfs_super_generation(super_tmp); - state->latest_superblock = superblock_tmp; - } - - for (pass = 0; pass < 3; pass++) { - u64 next_bytenr; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - switch (pass) { - case 0: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_TREE_OBJECTID); - additional_string = "initial root "; - next_bytenr = btrfs_super_root(super_tmp); - break; - case 1: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "initial chunk "; - next_bytenr = btrfs_super_chunk_root(super_tmp); - break; - case 2: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_TREE_LOG_OBJECTID); - additional_string = "initial log "; - next_bytenr = btrfs_super_log_root(super_tmp); - if (0 == next_bytenr) - continue; - break; - } - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - if (btrfsic_map_block(state, next_bytenr, - state->metablock_size, - &tmp_next_block_ctx, - mirror_num)) { - pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n", - next_bytenr, mirror_num); - ret = -1; - goto out; - } - - next_block = btrfsic_block_lookup_or_add( - state, &tmp_next_block_ctx, - additional_string, 1, 1, 0, - mirror_num, NULL); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - ret = -1; - goto out; - } - - next_block->disk_key = tmp_disk_key; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, &tmp_next_block_ctx, - next_block, superblock_tmp, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) { - ret = -1; - goto out; - } - } - } - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) - btrfsic_dump_tree_sub(state, superblock_tmp, 0); - -out: - put_page(page); - return ret; -} - -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) -{ - struct btrfsic_stack_frame *sf; - - sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (sf) - sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; - return sf; -} - -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) -{ - BUG_ON(!(NULL == sf || - BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); - kfree(sf); -} - -static noinline_for_stack int btrfsic_process_metablock( - struct btrfsic_state *state, - struct btrfsic_block *const first_block, - struct btrfsic_block_data_ctx *const first_block_ctx, - int first_limit_nesting, int force_iodone_flag) -{ - struct btrfsic_stack_frame initial_stack_frame = { 0 }; - struct btrfsic_stack_frame *sf; - struct btrfsic_stack_frame *next_stack; - struct btrfs_header *const first_hdr = - (struct btrfs_header *)first_block_ctx->datav[0]; - - BUG_ON(!first_hdr); - sf = 
&initial_stack_frame; - sf->error = 0; - sf->i = -1; - sf->limit_nesting = first_limit_nesting; - sf->block = first_block; - sf->block_ctx = first_block_ctx; - sf->next_block = NULL; - sf->hdr = first_hdr; - sf->prev = NULL; - -continue_with_new_stack_frame: - sf->block->generation = btrfs_stack_header_generation(sf->hdr); - if (0 == sf->hdr->level) { - struct btrfs_leaf *const leafhdr = - (struct btrfs_leaf *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = btrfs_stack_header_nritems(&leafhdr->header); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("leaf %llu items %d generation %llu owner %llu\n", - sf->block_ctx->start, sf->nr, - btrfs_stack_header_generation( - &leafhdr->header), - btrfs_stack_header_owner( - &leafhdr->header)); - } - -continue_with_current_leaf_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_item disk_item; - u32 disk_item_offset = - (uintptr_t)(leafhdr->items + sf->i) - - (uintptr_t)leafhdr; - struct btrfs_disk_key *disk_key; - u8 type; - u32 item_offset; - u32 item_size; - - if (disk_item_offset + sizeof(struct btrfs_item) > - sf->block_ctx->len) { -leaf_item_out_of_bounce_error: - pr_info( - "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n", - sf->block_ctx->start, - sf->block_ctx->dev->bdev); - goto one_stack_frame_backwards; - } - btrfsic_read_from_block_data(sf->block_ctx, - &disk_item, - disk_item_offset, - sizeof(struct btrfs_item)); - item_offset = btrfs_stack_item_offset(&disk_item); - item_size = btrfs_stack_item_size(&disk_item); - disk_key = &disk_item.key; - type = btrfs_disk_key_type(disk_key); - - if (BTRFS_ROOT_ITEM_KEY == type) { - struct btrfs_root_item root_item; - u32 root_item_offset; - u64 next_bytenr; - - root_item_offset = item_offset + - offsetof(struct btrfs_leaf, items); - if (root_item_offset + item_size > - sf->block_ctx->len) - goto leaf_item_out_of_bounce_error; - btrfsic_read_from_block_data( - sf->block_ctx, &root_item, - root_item_offset, - item_size); - next_bytenr = btrfs_root_bytenr(&root_item); - - sf->error = - btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - disk_key, - btrfs_root_generation( - &root_item)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.datav[0]; - - next_stack = - btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - sf->error = -1; - btrfsic_release_block_ctx( - &sf-> - next_block_ctx); - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = - &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - } else if (BTRFS_EXTENT_DATA_KEY == type && - state->include_extent_data) { - sf->error = btrfsic_handle_extent_data( - state, - sf->block, - sf->block_ctx, - item_offset, - force_iodone_flag); - if (sf->error) - goto one_stack_frame_backwards; - } - - goto continue_with_current_leaf_stack_frame; - } - } else { - struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = btrfs_stack_header_nritems(&nodehdr->header); - - 
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("node %llu level %d items %d generation %llu owner %llu\n", - sf->block_ctx->start, - nodehdr->header.level, sf->nr, - btrfs_stack_header_generation( - &nodehdr->header), - btrfs_stack_header_owner( - &nodehdr->header)); - } - -continue_with_current_node_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_key_ptr key_ptr; - u32 key_ptr_offset; - u64 next_bytenr; - - key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - - (uintptr_t)nodehdr; - if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > - sf->block_ctx->len) { - pr_info( - "btrfsic: node item out of bounce at logical %llu, dev %pg\n", - sf->block_ctx->start, - sf->block_ctx->dev->bdev); - goto one_stack_frame_backwards; - } - btrfsic_read_from_block_data( - sf->block_ctx, &key_ptr, key_ptr_offset, - sizeof(struct btrfs_key_ptr)); - next_bytenr = btrfs_stack_key_blockptr(&key_ptr); - - sf->error = btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - &key_ptr.key, - btrfs_stack_key_generation(&key_ptr)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.datav[0]; - - next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - sf->error = -1; - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - - goto continue_with_current_node_stack_frame; - } - } - -one_stack_frame_backwards: - if (NULL != sf->prev) { - struct btrfsic_stack_frame *const prev = sf->prev; - - /* the one for the initial block is freed in the caller */ - btrfsic_release_block_ctx(sf->block_ctx); - - if (sf->error) { - prev->error = sf->error; - btrfsic_stack_frame_free(sf); - sf = prev; - goto one_stack_frame_backwards; - } - - btrfsic_stack_frame_free(sf); - sf = prev; - goto continue_with_new_stack_frame; - } else { - BUG_ON(&initial_stack_frame != sf); - } - - return sf->error; -} - -static void btrfsic_read_from_block_data( - struct btrfsic_block_data_ctx *block_ctx, - void *dstv, u32 offset, size_t len) -{ - size_t cur; - size_t pgoff; - char *kaddr; - char *dst = (char *)dstv; - size_t start_offset = offset_in_page(block_ctx->start); - unsigned long i = (start_offset + offset) >> PAGE_SHIFT; - - WARN_ON(offset + len > block_ctx->len); - pgoff = offset_in_page(start_offset + offset); - - while (len > 0) { - cur = min(len, ((size_t)PAGE_SIZE - pgoff)); - BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); - kaddr = block_ctx->datav[i]; - memcpy(dst, kaddr + pgoff, cur); - - dst += cur; - len -= cur; - pgoff = 0; - i++; - } -} - -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 
parent_generation) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfsic_block *next_block = NULL; - int ret; - struct btrfsic_block_link *l; - int did_alloc_block_link; - int block_was_created; - - *next_blockp = NULL; - if (0 == *num_copiesp) { - *num_copiesp = btrfs_num_copies(fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, *num_copiesp); - *mirror_nump = 1; - } - - if (*mirror_nump > *num_copiesp) - return 0; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n", - *mirror_nump); - ret = btrfsic_map_block(state, next_bytenr, - state->metablock_size, - next_block_ctx, *mirror_nump); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, *mirror_nump); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - next_block = btrfsic_block_lookup_or_add(state, - next_block_ctx, "referenced ", - 1, force_iodone_flag, - !force_iodone_flag, - *mirror_nump, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - if (block_was_created) { - l = NULL; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) - pr_info( -"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", - next_bytenr, next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, - next_block), - next_block->logical_bytenr); - else - pr_info( - "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n", - next_bytenr, next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, - next_block)); - } - next_block->logical_bytenr = next_bytenr; - - next_block->mirror_num = *mirror_nump; - l = btrfsic_block_link_hashtable_lookup( - next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_link_hashtable); - } - - next_block->disk_key = *disk_key; - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - did_alloc_block_link = 1; - l->block_ref_to = next_block; - l->block_ref_from = block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - did_alloc_block_link = 0; - if (0 == limit_nesting) { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - } - - if (limit_nesting > 0 && did_alloc_block_link) { - ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)next_block_ctx->len) { - pr_info("btrfsic: read block @logical %llu failed!\n", - next_bytenr); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - *next_blockp = next_block; - } else { - *next_blockp = NULL; - } - 
(*mirror_nump)++; - - return 0; -} - -static int btrfsic_handle_extent_data( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfs_file_extent_item file_extent_item; - u64 file_extent_item_offset; - u64 next_bytenr; - u64 num_bytes; - u64 generation; - struct btrfsic_block_link *l; - int ret; - - file_extent_item_offset = offsetof(struct btrfs_leaf, items) + - item_offset; - if (file_extent_item_offset + - offsetof(struct btrfs_file_extent_item, disk_num_bytes) > - block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", - block_ctx->start, block_ctx->dev->bdev); - return -1; - } - - btrfsic_read_from_block_data(block_ctx, &file_extent_item, - file_extent_item_offset, - offsetof(struct btrfs_file_extent_item, disk_num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || - btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("extent_data: type %u, disk_bytenr = %llu\n", - file_extent_item.type, - btrfs_stack_file_extent_disk_bytenr( - &file_extent_item)); - return 0; - } - - if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > - block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", - block_ctx->start, block_ctx->dev->bdev); - return -1; - } - btrfsic_read_from_block_data(block_ctx, &file_extent_item, - file_extent_item_offset, - sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); - if (btrfs_stack_file_extent_compression(&file_extent_item) == - BTRFS_COMPRESS_NONE) { - next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); - } else { - num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); - } - generation = btrfs_stack_file_extent_generation(&file_extent_item); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n", - file_extent_item.type, - btrfs_stack_file_extent_disk_bytenr(&file_extent_item), - btrfs_stack_file_extent_offset(&file_extent_item), - num_bytes); - while (num_bytes > 0) { - u32 chunk_len; - int num_copies; - int mirror_num; - - if (num_bytes > state->datablock_size) - chunk_len = state->datablock_size; - else - chunk_len = num_bytes; - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - state->datablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfsic_block *next_block; - int block_was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n", - mirror_num); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("\tdisk_bytenr = %llu, num_bytes %u\n", - next_bytenr, chunk_len); - ret = btrfsic_map_block(state, next_bytenr, - chunk_len, &next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &next_block_ctx, - "referenced ", - 0, - 
force_iodone_flag, - !force_iodone_flag, - mirror_num, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(&next_block_ctx); - return -1; - } - if (!block_was_created) { - if ((state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) && - next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - pr_info( -"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n", - next_bytenr, - next_block_ctx.dev->bdev, - next_block_ctx.dev_bytenr, - mirror_num, - next_block->logical_bytenr); - } - next_block->logical_bytenr = next_bytenr; - next_block->mirror_num = mirror_num; - } - - l = btrfsic_block_link_lookup_or_add(state, - &next_block_ctx, - next_block, block, - generation); - btrfsic_release_block_ctx(&next_block_ctx); - if (NULL == l) - return -1; - } - - next_bytenr += chunk_len; - num_bytes -= chunk_len; - } - - return 0; -} - -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - int ret; - u64 length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap, *map; - struct btrfs_device *device; - - length = len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, - NULL, &mirror_num, 0); - if (ret) { - block_ctx_out->start = 0; - block_ctx_out->dev_bytenr = 0; - block_ctx_out->len = 0; - block_ctx_out->dev = NULL; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - - return ret; - } - - if (bioc) - map = &bioc->stripes[0]; - else - map = &smap; - - device = map->dev; - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || - !device->bdev || !device->name) - block_ctx_out->dev = NULL; - else - block_ctx_out->dev = btrfsic_dev_state_lookup( - device->bdev->bd_dev); - block_ctx_out->dev_bytenr = map->physical; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - - kfree(bioc); - if (NULL == block_ctx_out->dev) { - ret = -ENXIO; - pr_info("btrfsic: error, cannot lookup dev (#1)!\n"); - } - - return ret; -} - -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) -{ - if (block_ctx->mem_to_free) { - unsigned int num_pages; - - BUG_ON(!block_ctx->datav); - BUG_ON(!block_ctx->pagev); - num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> - PAGE_SHIFT; - /* Pages must be unmapped in reverse order */ - while (num_pages > 0) { - num_pages--; - if (block_ctx->datav[num_pages]) - block_ctx->datav[num_pages] = NULL; - if (block_ctx->pagev[num_pages]) { - __free_page(block_ctx->pagev[num_pages]); - block_ctx->pagev[num_pages] = NULL; - } - } - - kfree(block_ctx->mem_to_free); - block_ctx->mem_to_free = NULL; - block_ctx->pagev = NULL; - block_ctx->datav = NULL; - } -} - -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx) -{ - unsigned int num_pages; - unsigned int i; - size_t size; - u64 dev_bytenr; - int ret; - - BUG_ON(block_ctx->datav); - BUG_ON(block_ctx->pagev); - BUG_ON(block_ctx->mem_to_free); - if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { - pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", - block_ctx->dev_bytenr); - return -1; - } - - num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> - PAGE_SHIFT; - size = sizeof(*block_ctx->datav) + 
sizeof(*block_ctx->pagev); - block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS); - if (!block_ctx->mem_to_free) - return -ENOMEM; - block_ctx->datav = block_ctx->mem_to_free; - block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); - ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); - if (ret) - return ret; - - dev_bytenr = block_ctx->dev_bytenr; - for (i = 0; i < num_pages;) { - struct bio *bio; - unsigned int j; - - bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, - REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT; - - for (j = i; j < num_pages; j++) { - ret = bio_add_page(bio, block_ctx->pagev[j], - PAGE_SIZE, 0); - if (PAGE_SIZE != ret) - break; - } - if (j == i) { - pr_info("btrfsic: error, failed to add a single page!\n"); - return -1; - } - if (submit_bio_wait(bio)) { - pr_info("btrfsic: read error at logical %llu dev %pg!\n", - block_ctx->start, block_ctx->dev->bdev); - bio_put(bio); - return -1; - } - bio_put(bio); - dev_bytenr += (j - i) * PAGE_SIZE; - i = j; - } - for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = page_address(block_ctx->pagev[i]); - - return block_ctx->len; -} - -static void btrfsic_dump_database(struct btrfsic_state *state) -{ - const struct btrfsic_block *b_all; - - BUG_ON(NULL == state); - - pr_info("all_blocks_list:\n"); - list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { - const struct btrfsic_block_link *l; - - pr_info("%c-block @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num); - - list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { - pr_info( - " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - } - - list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { - pr_info( - " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - } - - pr_info("\n"); - } -} - -/* - * Test whether the disk block contains a tree block (leaf or node) - * (note that this test fails for the super block) - */ -static noinline_for_stack int btrfsic_test_for_metadata( - struct btrfsic_state *state, - char **datav, unsigned int num_pages) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - struct btrfs_header *h; - u8 csum[BTRFS_CSUM_SIZE]; - unsigned int i; - - if (num_pages * PAGE_SIZE < state->metablock_size) - return 1; /* not metadata */ - num_pages = state->metablock_size >> PAGE_SHIFT; - h = (struct btrfs_header *)datav[0]; - - if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) - return 1; - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - - for (i = 0; i < num_pages; i++) { - u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); - size_t sublen = i ? 
PAGE_SIZE : - (PAGE_SIZE - BTRFS_CSUM_SIZE); - - crypto_shash_update(shash, data, sublen); - } - crypto_shash_final(shash, csum); - if (memcmp(csum, h->csum, fs_info->csum_size)) - return 1; - - return 0; /* is metadata */ -} - -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char **mapped_datav, - unsigned int num_pages, - struct bio *bio, int *bio_is_patched, - blk_opf_t submit_bio_bh_rw) -{ - int is_metadata; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx block_ctx; - int ret; - struct btrfsic_state *state = dev_state->state; - struct block_device *bdev = dev_state->bdev; - unsigned int processed_len; - - if (NULL != bio_is_patched) - *bio_is_patched = 0; - -again: - if (num_pages == 0) - return; - - processed_len = 0; - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, - num_pages)); - - block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, - &state->block_hashtable); - if (NULL != block) { - u64 bytenr = 0; - struct btrfsic_block_link *l, *tmp; - - if (block->is_superblock) { - bytenr = btrfs_super_bytenr((struct btrfs_super_block *) - mapped_datav[0]); - if (num_pages * PAGE_SIZE < - BTRFS_SUPER_INFO_SIZE) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - is_metadata = 1; - BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); - processed_len = BTRFS_SUPER_INFO_SIZE; - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { - pr_info("[before new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } - if (is_metadata) { - if (!block->is_superblock) { - if (num_pages * PAGE_SIZE < - state->metablock_size) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - processed_len = state->metablock_size; - bytenr = btrfs_stack_header_bytenr( - (struct btrfs_header *) - mapped_datav[0]); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, - dev_state, - dev_bytenr); - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { - if (block->logical_bytenr != bytenr && - !(!block->is_metadata && - block->logical_bytenr == 0)) - pr_info( -"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", - bytenr, dev_state->bdev, - dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, - block), - block->logical_bytenr); - else - pr_info( - "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", - bytenr, dev_state->bdev, - dev_bytenr, block->mirror_num, - btrfsic_get_block_type(state, - block)); - } - block->logical_bytenr = bytenr; - } else { - if (num_pages * PAGE_SIZE < - state->datablock_size) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - processed_len = state->datablock_size; - bytenr = block->logical_bytenr; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", - bytenr, dev_state->bdev, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("ref_to_list: %cE, ref_from_list: %cE\n", - list_empty(&block->ref_to_list) ? ' ' : '!', - list_empty(&block->ref_from_list) ? 
' ' : '!'); - if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - pr_info( -"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), bytenr, - dev_state->bdev, dev_bytenr, block->mirror_num, - block->generation, - btrfs_disk_key_objectid(&block->disk_key), - block->disk_key.type, - btrfs_disk_key_offset(&block->disk_key), - btrfs_stack_header_generation( - (struct btrfs_header *) mapped_datav[0]), - state->max_superblock_generation); - btrfsic_dump_tree(state); - } - - if (!block->is_iodone && !block->never_written) { - pr_info( -"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", - btrfsic_get_block_type(state, block), bytenr, - dev_state->bdev, dev_bytenr, block->mirror_num, - block->generation, - btrfs_stack_header_generation( - (struct btrfs_header *) - mapped_datav[0])); - /* it would not be safe to go on */ - btrfsic_dump_tree(state); - goto continue_loop; - } - - /* - * Clear all references of this block. Do not free - * the block itself even if is not referenced anymore - * because it still carries valuable information - * like whether it was ever written and IO completed. - */ - list_for_each_entry_safe(l, tmp, &block->ref_to_list, - node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - l->ref_cnt--; - if (0 == l->ref_cnt) { - list_del(&l->node_ref_to); - list_del(&l->node_ref_from); - btrfsic_block_link_hashtable_remove(l); - btrfsic_block_link_free(l); - } - } - - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.pagev = NULL; - block_ctx.mem_to_free = NULL; - block_ctx.datav = mapped_datav; - - if (is_metadata || state->include_extent_data) { - block->never_written = 0; - block->iodone_w_error = 0; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_private = - bio->bi_private; - block->orig_bio_end_io = - bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_private = - chained_block->orig_bio_private; - block->orig_bio_end_io = - chained_block->orig_bio_end_io; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else { - block->is_iodone = 1; - block->orig_bio_private = NULL; - block->orig_bio_end_io = NULL; - block->next_in_same_bio = NULL; - } - } - - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (is_metadata) { - block->logical_bytenr = bytenr; - block->is_metadata = 1; - if (block->is_superblock) { - BUG_ON(PAGE_SIZE != - BTRFS_SUPER_INFO_SIZE); - ret = btrfsic_process_written_superblock( - state, - block, - (struct btrfs_super_block *) - mapped_datav[0]); - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { - pr_info("[after new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } else { - block->mirror_num = 0; /* unknown */ - ret = btrfsic_process_metablock( - state, - block, - &block_ctx, - 0, 0); - } - if (ret) - pr_info("btrfsic: btrfsic_process_metablock(root 
@%llu) failed!\n", - dev_bytenr); - } else { - block->is_metadata = 0; - block->mirror_num = 0; /* unknown */ - block->generation = BTRFSIC_GENERATION_UNKNOWN; - if (!state->include_extent_data - && list_empty(&block->ref_from_list)) { - /* - * disk block is overwritten with extent - * data (not meta data) and we are configured - * to not include extent data: take the - * chance and free the block's memory - */ - btrfsic_block_hashtable_remove(block); - list_del(&block->all_blocks_node); - btrfsic_block_free(block); - } - } - btrfsic_release_block_ctx(&block_ctx); - } else { - /* block has not been found in hash table */ - u64 bytenr; - - if (!is_metadata) { - processed_len = state->datablock_size; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block (%pg/%llu/?) !found in hash table, D\n", - dev_state->bdev, dev_bytenr); - if (!state->include_extent_data) { - /* ignore that written D block */ - goto continue_loop; - } - - /* this is getting ugly for the - * include_extent_data case... */ - bytenr = 0; /* unknown */ - } else { - processed_len = state->metablock_size; - bytenr = btrfs_stack_header_bytenr( - (struct btrfs_header *) - mapped_datav[0]); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block @%llu (%pg/%llu/?) !found in hash table, M\n", - bytenr, dev_state->bdev, dev_bytenr); - } - - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.pagev = NULL; - block_ctx.mem_to_free = NULL; - block_ctx.datav = mapped_datav; - - block = btrfsic_block_alloc(); - if (NULL == block) { - btrfsic_release_block_ctx(&block_ctx); - goto continue_loop; - } - block->dev_state = dev_state; - block->dev_bytenr = dev_bytenr; - block->logical_bytenr = bytenr; - block->is_metadata = is_metadata; - block->never_written = 0; - block->iodone_w_error = 0; - block->mirror_num = 0; /* unknown */ - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_private = - chained_block->orig_bio_private; - block->orig_bio_end_io = - chained_block->orig_bio_end_io; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else { - block->is_iodone = 1; - block->orig_bio_private = NULL; - block->orig_bio_end_io = NULL; - block->next_in_same_bio = NULL; - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("new written %c-block @%llu (%pg/%llu/%d)\n", - is_metadata ? 
'M' : 'D', - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - - if (is_metadata) { - ret = btrfsic_process_metablock(state, block, - &block_ctx, 0, 0); - if (ret) - pr_info("btrfsic: process_metablock(root @%llu) failed!\n", - dev_bytenr); - } - btrfsic_release_block_ctx(&block_ctx); - } - -continue_loop: - BUG_ON(!processed_len); - dev_bytenr += processed_len; - mapped_datav += processed_len >> PAGE_SHIFT; - num_pages -= processed_len >> PAGE_SHIFT; - goto again; -} - -static void btrfsic_bio_end_io(struct bio *bp) -{ - struct btrfsic_block *block = bp->bi_private; - int iodone_w_error; - - /* mutex is not held! This is not safe if IO is not yet completed - * on umount */ - iodone_w_error = 0; - if (bp->bi_status) - iodone_w_error = 1; - - BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_private; - bp->bi_end_io = block->orig_bio_end_io; - - do { - struct btrfsic_block *next_block; - struct btrfsic_dev_state *const dev_state = block->dev_state; - - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n", - bp->bi_status, - btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, dev_state->bdev, - block->dev_bytenr, block->mirror_num); - next_block = block->next_in_same_bio; - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_PREFLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io() new %pg flush_gen=%llu\n", - dev_state->bdev, - dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is - * on disk */ - block->is_iodone = 1; /* for FLUSH, this releases the block */ - block = next_block; - } while (NULL != block); - - bp->bi_end_io(bp); -} - -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const superblock, - struct btrfs_super_block *const super_hdr) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - int pass; - - superblock->generation = btrfs_super_generation(super_hdr); - if (!(superblock->generation > state->max_superblock_generation || - 0 == state->max_superblock_generation)) { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info( - "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n", - superblock->logical_bytenr, - superblock->dev_state->bdev, - superblock->dev_bytenr, superblock->mirror_num, - btrfs_super_generation(super_hdr), - state->max_superblock_generation); - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info( - "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n", - superblock->logical_bytenr, - superblock->dev_state->bdev, - superblock->dev_bytenr, superblock->mirror_num, - btrfs_super_generation(super_hdr), - state->max_superblock_generation); - - state->max_superblock_generation = - btrfs_super_generation(super_hdr); - state->latest_superblock = superblock; - } - - for (pass = 0; pass < 3; pass++) { - int ret; - u64 next_bytenr; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key = {0}; - - 
btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_ITEM_KEY); - btrfs_set_disk_key_objectid(&tmp_disk_key, 0); - - switch (pass) { - case 0: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_TREE_OBJECTID); - additional_string = "root "; - next_bytenr = btrfs_super_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("root@%llu\n", next_bytenr); - break; - case 1: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "chunk "; - next_bytenr = btrfs_super_chunk_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("chunk@%llu\n", next_bytenr); - break; - case 2: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_TREE_LOG_OBJECTID); - additional_string = "log "; - next_bytenr = btrfs_super_log_root(super_hdr); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("log@%llu\n", next_bytenr); - break; - } - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - BTRFS_SUPER_INFO_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - int was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, - BTRFS_SUPER_INFO_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &tmp_next_block_ctx, - additional_string, - 1, 0, 1, - mirror_num, - &was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - return -1; - } - - next_block->disk_key = tmp_disk_key; - if (was_created) - next_block->generation = - BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, - &tmp_next_block_ctx, - next_block, - superblock, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) - return -1; - } - } - - if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) - btrfsic_dump_tree(state); - - return 0; -} - -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level) -{ - const struct btrfsic_block_link *l; - int ret = 0; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* - * Note that this situation can happen and does not - * indicate an error in regular cases. It happens - * when disk blocks are freed and later reused. - * The check-integrity module is not aware of any - * block free operations, it just recognizes block - * write operations. Therefore it keeps the linkage - * information for a block until a block is - * rewritten. This can temporarily cause incorrect - * and even circular linkage information. This - * causes no harm unless such blocks are referenced - * by the most recent super block. - */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic: abort cyclic linkage (case 1).\n"); - - return ret; - } - - /* - * This algorithm is recursive because the amount of used stack - * space is very small and the max recursion depth is limited. 
- */ - list_for_each_entry(l, &block->ref_to_list, node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - if (l->block_ref_to->never_written) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (!l->block_ref_to->is_iodone) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->block_ref_to->iodone_w_error) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->parent_generation != - l->block_ref_to->generation && - BTRFSIC_GENERATION_UNKNOWN != - l->parent_generation && - BTRFSIC_GENERATION_UNKNOWN != - l->block_ref_to->generation) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - l->block_ref_to->generation, - l->parent_generation); - ret = -1; - } else if (l->block_ref_to->flush_gen > - l->block_ref_to->dev_state->last_flush_gen) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, block->flush_gen, - l->block_ref_to->dev_state->last_flush_gen); - ret = -1; - } else if (-1 == btrfsic_check_all_ref_blocks(state, - l->block_ref_to, - recursion_level + - 1)) { - ret = -1; - } - } - - return ret; -} - -static int btrfsic_is_block_ref_by_superblock( - const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level) -{ - const struct btrfsic_block_link *l; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* refer to comment at "abort cyclic linkage (case 1)" */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic: abort cyclic linkage (case 2).\n"); - - return 0; - } - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. 
- */ - list_for_each_entry(l, &block->ref_from_list, node_ref_from) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - if (l->block_ref_from->is_superblock && - state->latest_superblock->dev_bytenr == - l->block_ref_from->dev_bytenr && - state->latest_superblock->dev_state->bdev == - l->block_ref_from->dev_state->bdev) - return 1; - else if (btrfsic_is_block_ref_by_superblock(state, - l->block_ref_from, - recursion_level + - 1)) - return 1; - } - - return 0; -} - -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block) -{ - if (block->is_superblock && - state->latest_superblock->dev_bytenr == block->dev_bytenr && - state->latest_superblock->dev_state->bdev == block->dev_state->bdev) - return 'S'; - else if (block->is_superblock) - return 's'; - else if (block->is_metadata) - return 'M'; - else - return 'D'; -} - -static void btrfsic_dump_tree(const struct btrfsic_state *state) -{ - btrfsic_dump_tree_sub(state, state->latest_superblock, 0); -} - -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level) -{ - const struct btrfsic_block_link *l; - int indent_add; - static char buf[80]; - int cursor_position; - - /* - * Should better fill an on-stack buffer with a complete line and - * dump it at once when it is time to print a newline character. - */ - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. 
- */ - indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)", - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num); - if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - return; - } - printk(buf); - indent_level += indent_add; - if (list_empty(&block->ref_to_list)) { - printk("\n"); - return; - } - if (block->mirror_num > 1 && - !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { - printk(" [...]\n"); - return; - } - - cursor_position = indent_level; - list_for_each_entry(l, &block->ref_to_list, node_ref_to) { - while (cursor_position < indent_level) { - printk(" "); - cursor_position++; - } - if (l->ref_cnt > 1) - indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); - else - indent_add = sprintf(buf, " --> "); - if (indent_level + indent_add > - BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - cursor_position = 0; - continue; - } - - printk(buf); - - btrfsic_dump_tree_sub(state, l->block_ref_to, - indent_level + indent_add); - cursor_position = 0; - } -} - -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation) -{ - struct btrfsic_block_link *l; - - l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - from_block->dev_state->bdev, - from_block->dev_bytenr, - &state->block_link_hashtable); - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (!l) - return NULL; - - l->block_ref_to = next_block; - l->block_ref_from = from_block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &from_block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - - return l; -} - -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created) -{ - struct btrfsic_block *block; - - block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_hashtable); - if (NULL == block) { - struct btrfsic_dev_state *dev_state; - - block = btrfsic_block_alloc(); - if (!block) - return NULL; - - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); - if (NULL == dev_state) { - pr_info("btrfsic: error, lookup dev_state failed!\n"); - btrfsic_block_free(block); - return NULL; - } - block->dev_state = dev_state; - block->dev_bytenr = block_ctx->dev_bytenr; - block->logical_bytenr = block_ctx->start; - block->is_metadata = is_metadata; - block->is_iodone = is_iodone; - block->never_written = never_written; - block->mirror_num = mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n", - additional_string, - btrfsic_get_block_type(state, block), - block->logical_bytenr, dev_state->bdev, - block->dev_bytenr, mirror_num); - list_add(&block->all_blocks_node, 
&state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - if (NULL != was_created) - *was_created = 1; - } else { - if (NULL != was_created) - *was_created = 0; - } - - return block; -} - -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfsic_block_data_ctx block_ctx; - int num_copies; - int mirror_num; - int match = 0; - int ret; - - num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, state->metablock_size, - &block_ctx, mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n", - bytenr, mirror_num); - continue; - } - - if (dev_state->bdev == block_ctx.dev->bdev && - dev_bytenr == block_ctx.dev_bytenr) { - match++; - btrfsic_release_block_ctx(&block_ctx); - break; - } - btrfsic_release_block_ctx(&block_ctx); - } - - if (WARN_ON(!match)) { - pr_info( -"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n", - bytenr, dev_state->bdev, dev_bytenr); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, - state->metablock_size, - &block_ctx, mirror_num); - if (ret) - continue; - - pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n", - bytenr, block_ctx.dev->bdev, - block_ctx.dev_bytenr, mirror_num); - } - } -} - -static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) -{ - return btrfsic_dev_state_hashtable_lookup(dev, - &btrfsic_dev_state_hashtable); -} - -static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) -{ - unsigned int segs = bio_segments(bio); - u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; - u64 cur_bytenr = dev_bytenr; - struct bvec_iter iter; - struct bio_vec bvec; - char **mapped_datav; - int bio_is_patched = 0; - int i = 0; - - if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info( -"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - - mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); - if (!mapped_datav) - return; - - bio_for_each_segment(bvec, bio, iter) { - BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = page_address(bvec.bv_page); - i++; - - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); - cur_bytenr += bvec.bv_len; - } - - btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, - bio, &bio_is_patched, bio->bi_opf); - kfree(mapped_datav); -} - -static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) -{ - if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); - - if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - 
block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } else if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) { - pr_info( -"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->bdev); - } -} - -void btrfsic_check_bio(struct bio *bio) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return; - - /* - * We can be called before btrfsic_mount, so there might not be a - * dev_state. - */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); - mutex_lock(&btrfsic_mutex); - if (dev_state) { - if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) - btrfsic_check_write_bio(bio, dev_state); - else if (bio->bi_opf & REQ_PREFLUSH) - btrfsic_check_flush_bio(bio, dev_state); - } - mutex_unlock(&btrfsic_mutex); -} - -int btrfsic_mount(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask) -{ - int ret; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!PAGE_ALIGNED(fs_info->nodesize)) { - pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", - fs_info->nodesize, PAGE_SIZE); - return -1; - } - if (!PAGE_ALIGNED(fs_info->sectorsize)) { - pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", - fs_info->sectorsize, PAGE_SIZE); - return -1; - } - state = kvzalloc(sizeof(*state), GFP_KERNEL); - if (!state) - return -ENOMEM; - - if (!btrfsic_is_initialized) { - mutex_init(&btrfsic_mutex); - btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); - btrfsic_is_initialized = 1; - } - mutex_lock(&btrfsic_mutex); - state->fs_info = fs_info; - state->print_mask = print_mask; - state->include_extent_data = including_extent_data; - state->metablock_size = fs_info->nodesize; - state->datablock_size = fs_info->sectorsize; - INIT_LIST_HEAD(&state->all_blocks_list); - btrfsic_block_hashtable_init(&state->block_hashtable); - btrfsic_block_link_hashtable_init(&state->block_link_hashtable); - state->max_superblock_generation = 0; - state->latest_superblock = NULL; - - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_alloc(); - if (NULL == ds) { - mutex_unlock(&btrfsic_mutex); - return -ENOMEM; - } - ds->bdev = device->bdev; - ds->state = state; - btrfsic_dev_state_hashtable_add(ds, - &btrfsic_dev_state_hashtable); - } - - ret = btrfsic_process_superblock(state, fs_devices); - if (0 != ret) { - mutex_unlock(&btrfsic_mutex); - btrfsic_unmount(fs_devices); - return ret; - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) - btrfsic_dump_database(state); - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) - btrfsic_dump_tree(state); - - mutex_unlock(&btrfsic_mutex); - return 0; -} - -void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) -{ - struct btrfsic_block *b_all, *tmp_all; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!btrfsic_is_initialized) - return; - - mutex_lock(&btrfsic_mutex); - - state = NULL; - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if 
(!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_hashtable_lookup( - device->bdev->bd_dev, - &btrfsic_dev_state_hashtable); - if (NULL != ds) { - state = ds->state; - btrfsic_dev_state_hashtable_remove(ds); - btrfsic_dev_state_free(ds); - } - } - - if (NULL == state) { - pr_info("btrfsic: error, cannot find state information on umount!\n"); - mutex_unlock(&btrfsic_mutex); - return; - } - - /* - * Don't care about keeping the lists' state up to date, - * just free all memory that was allocated dynamically. - * Free the blocks and the block_links. - */ - list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, - all_blocks_node) { - struct btrfsic_block_link *l, *tmp; - - list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, - node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - - l->ref_cnt--; - if (0 == l->ref_cnt) - btrfsic_block_link_free(l); - } - - if (b_all->is_iodone || b_all->never_written) - btrfsic_block_free(b_all); - else - pr_info( -"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num); - } - - mutex_unlock(&btrfsic_mutex); - - kvfree(state); -} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h deleted file mode 100644 index e4c8aed7996f..000000000000 --- a/fs/btrfs/check-integrity.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - */ - -#ifndef BTRFS_CHECK_INTEGRITY_H -#define BTRFS_CHECK_INTEGRITY_H - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -void btrfsic_check_bio(struct bio *bio); -#else -static inline void btrfsic_check_bio(struct bio *bio) { } -#endif - -int btrfsic_mount(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask); -void btrfsic_unmount(struct btrfs_fs_devices *fs_devices); - -#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e6acf09a1507..65d883da86c6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -20,12 +20,11 @@ #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> +#include <linux/shrinker.h> #include <crypto/hash.h> #include "misc.h" #include "ctree.h" #include "fs.h" -#include "disk-io.h" -#include "transaction.h" #include "btrfs_inode.h" #include "bio.h" #include "ordered-data.h" @@ -33,8 +32,7 @@ #include "extent_io.h" #include "extent_map.h" #include "subpage.h" -#include "zoned.h" -#include "file-item.h" +#include "messages.h" #include "super.h" static struct bio_set btrfs_compressed_bioset; @@ -92,20 +90,20 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) } static int compression_compress_pages(int type, struct list_head *ws, - struct address_space *mapping, u64 start, struct page **pages, - unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out) + struct address_space *mapping, u64 start, + struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { switch (type) { case BTRFS_COMPRESS_ZLIB: - return zlib_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return zlib_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_LZO: - return lzo_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return 
lzo_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_ZSTD: - return zstd_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return zstd_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_NONE: default: /* @@ -117,7 +115,7 @@ static int compression_compress_pages(int type, struct list_head *ws, * Not a big deal, just need to inform caller that we * haven't allocated any pages yet. */ - *out_pages = 0; + *out_folios = 0; return -E2BIG; } } @@ -140,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, + const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { - case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: @@ -160,16 +158,110 @@ static int compression_decompress(int type, struct list_head *ws, } } -static void btrfs_free_compressed_pages(struct compressed_bio *cb) +static void btrfs_free_compressed_folios(struct compressed_bio *cb) { - for (unsigned int i = 0; i < cb->nr_pages; i++) - put_page(cb->compressed_pages[i]); - kfree(cb->compressed_pages); + for (unsigned int i = 0; i < cb->nr_folios; i++) + btrfs_free_compr_folio(cb->compressed_folios[i]); + kfree(cb->compressed_folios); } static int btrfs_decompress_bio(struct compressed_bio *cb); -static void end_compressed_bio_read(struct btrfs_bio *bbio) +/* + * Global cache of last unused pages for compression/decompression. + */ +static struct btrfs_compr_pool { + struct shrinker *shrinker; + spinlock_t lock; + struct list_head list; + int count; + int thresh; +} compr_pool; + +static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc) +{ + int ret; + + /* + * We must not read the values more than once if 'ret' gets expanded in + * the return statement so we don't accidentally return a negative + * number, even if the first condition finds it positive. + */ + ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh); + + return ret > 0 ? ret : 0; +} + +static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc) +{ + struct list_head remove; + struct list_head *tmp, *next; + int freed; + + if (compr_pool.count == 0) + return SHRINK_STOP; + + INIT_LIST_HEAD(&remove); + + /* For now, just simply drain the whole list. 
*/ + spin_lock(&compr_pool.lock); + list_splice_init(&compr_pool.list, &remove); + freed = compr_pool.count; + compr_pool.count = 0; + spin_unlock(&compr_pool.lock); + + list_for_each_safe(tmp, next, &remove) { + struct page *page = list_entry(tmp, struct page, lru); + + ASSERT(page_ref_count(page) == 1); + put_page(page); + } + + return freed; +} + +/* + * Common wrappers for page allocation from compression wrappers + */ +struct folio *btrfs_alloc_compr_folio(void) +{ + struct folio *folio = NULL; + + spin_lock(&compr_pool.lock); + if (compr_pool.count > 0) { + folio = list_first_entry(&compr_pool.list, struct folio, lru); + list_del_init(&folio->lru); + compr_pool.count--; + } + spin_unlock(&compr_pool.lock); + + if (folio) + return folio; + + return folio_alloc(GFP_NOFS, 0); +} + +void btrfs_free_compr_folio(struct folio *folio) +{ + bool do_free = false; + + spin_lock(&compr_pool.lock); + if (compr_pool.count > compr_pool.thresh) { + do_free = true; + } else { + list_add(&folio->lru, &compr_pool.list); + compr_pool.count++; + } + spin_unlock(&compr_pool.lock); + + if (!do_free) + return; + + ASSERT(folio_ref_count(folio) == 1); + folio_put(folio); +} + +static void end_bbio_compressed_read(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); blk_status_t status = bbio->bio.bi_status; @@ -177,7 +269,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) if (!status) status = errno_to_blk_status(btrfs_decompress_bio(cb)); - btrfs_free_compressed_pages(cb); + btrfs_free_compressed_folios(cb); btrfs_bio_end_io(cb->orig_bbio, status); bio_put(&bbio->bio); } @@ -189,16 +281,16 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) static noinline void end_compressed_writeback(const struct compressed_bio *cb) { struct inode *inode = &cb->bbio.inode->vfs_inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); + const int error = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; - if (errno) - mapping_set_error(inode->i_mapping, errno); + if (error) + mapping_set_error(inode->i_mapping, error); folio_batch_init(&fbatch); while (index <= end_index) { @@ -211,8 +303,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) for (i = 0; i < ret; i++) { struct folio *folio = fbatch.folios[i]; - btrfs_page_clamp_clear_writeback(fs_info, &folio->page, - cb->start, cb->len); + btrfs_folio_clamp_clear_writeback(fs_info, folio, + cb->start, cb->len); } folio_batch_release(&fbatch); } @@ -231,7 +323,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) end_compressed_writeback(cb); /* Note, our inode could be gone now */ - btrfs_free_compressed_pages(cb); + btrfs_free_compressed_folios(cb); bio_put(&cb->bbio.bio); } @@ -242,7 +334,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) * This also calls the writeback end hooks for the file pages so that metadata * and checksums can be updated in the file. 
*/ -static void end_compressed_bio_write(struct btrfs_bio *bbio) +static void end_bbio_compressed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; @@ -250,17 +342,19 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio) queue_work(fs_info->compressed_write_workers, &cb->write_end_work); } -static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) +static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { struct bio *bio = &cb->bbio.bio; u32 offset = 0; while (offset < cb->compressed_len) { + int ret; u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); /* Maximum compressed extent is smaller than bio size limit. */ - __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], - len, 0); + ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT], + len, 0); + ASSERT(ret); offset += len; } } @@ -275,12 +369,12 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) * the end io hooks. */ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct page **compressed_pages, - unsigned int nr_pages, + struct folio **compressed_folios, + unsigned int nr_folios, blk_opf_t write_flags, bool writeback) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct compressed_bio *cb; @@ -289,19 +383,19 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb = alloc_compressed_bio(inode, ordered->file_offset, REQ_OP_WRITE | write_flags, - end_compressed_bio_write); + end_bbio_compressed_write); cb->start = ordered->file_offset; cb->len = ordered->num_bytes; - cb->compressed_pages = compressed_pages; + cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); - cb->nr_pages = nr_pages; + cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; - btrfs_add_compressed_bio_pages(cb); + btrfs_add_compressed_bio_folios(cb); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); } /* @@ -320,13 +414,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, struct compressed_bio *cb, int *memstall, unsigned long *pflags) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long end_index; struct bio *orig_bio = &cb->orig_bbio->bio; u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; - struct page *page; + struct folio *folio; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; @@ -346,7 +440,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * This makes readahead less effective, so here disable readahead for * subpage for now, until full compressed write is supported. 
*/ - if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE) + if (fs_info->sectorsize < PAGE_SIZE) return 0; end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; @@ -359,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - page = xa_load(&mapping->i_pages, pg_index); - if (page && !xa_is_value(page)) { - sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + folio = __filemap_get_folio(mapping, pg_index, 0, 0); + if (!IS_ERR(folio)) { + u64 folio_sz = folio_size(folio); + u64 offset = offset_in_folio(folio, cur); + + folio_put(folio); + sectors_missed += (folio_sz - offset) >> fs_info->sectorsize_bits; /* Beyond threshold, no need to continue */ @@ -372,35 +470,35 @@ static noinline int add_ra_bio_pages(struct inode *inode, * Jump to next page start as we already have page for * current offset. */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += (folio_sz - offset); continue; } - page = __page_cache_alloc(mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, + ~__GFP_FS), 0); + if (!folio) break; - if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { - put_page(page); + if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { /* There is already a page, skip to page end */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += folio_size(folio); + folio_put(folio); continue; } - if (!*memstall && PageWorkingset(page)) { + if (!*memstall && folio_test_workingset(folio)) { psi_memstall_enter(pflags); *memstall = 1; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } - page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); @@ -413,31 +511,32 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ if (!em || cur < em->start || (cur + fs_info->sectorsize > extent_map_end(em)) || - (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { + (extent_map_block_start(em) >> SECTOR_SHIFT) != + orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); - if (page->index == end_index) { - size_t zero_offset = offset_in_page(isize); + if (folio->index == end_index) { + size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { int zeros; - zeros = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, zeros); + zeros = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, zeros); } } - ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); - if (ret != add_size) { - unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + if (!bio_add_folio(orig_bio, folio, add_size, + offset_in_folio(folio, cur))) { + folio_unlock(folio); + folio_put(folio); break; } /* @@ -446,8 +545,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. 
*/ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, page, cur, add_size); - put_page(page); + btrfs_folio_set_lock(fs_info, folio, cur, add_size); + folio_put(folio); cur += add_size; } return 0; @@ -489,31 +588,31 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) goto out; } - ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); - compressed_len = em->block_len; + ASSERT(extent_map_is_compressed(em)); + compressed_len = em->disk_num_bytes; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, - end_compressed_bio_read); + end_bbio_compressed_read); - cb->start = em->orig_start; + cb->start = em->start - em->offset; em_len = em->len; em_start = em->start; cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = em->compress_type; + cb->compress_type = extent_map_compression(em); cb->orig_bbio = bbio; free_extent_map(em); - cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); - if (!cb->compressed_pages) { + cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!cb->compressed_folios) { ret = BLK_STS_RESOURCE; goto out_free_bio; } - ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); if (ret2) { ret = BLK_STS_RESOURCE; goto out_free_compressed_pages; @@ -525,16 +624,16 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) /* include any pages we added in add_ra-bio_pages */ cb->len = bbio->bio.bi_iter.bi_size; cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; - btrfs_add_compressed_bio_pages(cb); + btrfs_add_compressed_bio_folios(cb); if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); return; out_free_compressed_pages: - kfree(cb->compressed_pages); + kfree(cb->compressed_folios); out_free_bio: bio_put(&cb->bbio.bio); out: @@ -881,6 +980,29 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) return level; } +/* Wrapper around find_get_page(), with extra error message. */ +int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, + struct folio **in_folio_ret) +{ + struct folio *in_folio; + + /* + * The compressed write path should have the folio locked already, thus + * we only need to grab one reference. + */ + in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT); + if (IS_ERR(in_folio)) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_crit(inode->root->fs_info, + "failed to get page cache, root %lld ino %llu file offset %llu", + btrfs_root_id(inode->root), btrfs_ino(inode), start); + return -ENOENT; + } + *in_folio_ret = in_folio; + return 0; +} + /* * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. 
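The compr_pool added earlier in this file keeps recently freed compression folios on a small, spinlock-protected free list: btrfs_alloc_compr_folio() pops from the list before falling back to the allocator, btrfs_free_compr_folio() pushes back until a threshold is reached, and the registered shrinker drains the whole list under memory pressure. As a rough illustration only, here is a self-contained userspace model of that pattern; the names pool_alloc/pool_free/pool_shrink are invented for this sketch, a pthread mutex stands in for the spinlock, and malloc/free stand in for the folio allocator.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

static struct {
	pthread_mutex_t lock;   /* models compr_pool.lock */
	struct node *list;      /* models compr_pool.list */
	int count;
	int thresh;
} pool = { PTHREAD_MUTEX_INITIALIZER, NULL, 0, 32 };

/* Reuse a cached buffer if one is available, else allocate fresh. */
static void *pool_alloc(size_t size)
{
	struct node *n = NULL;

	pthread_mutex_lock(&pool.lock);
	if (pool.count > 0) {
		n = pool.list;
		pool.list = n->next;
		pool.count--;
	}
	pthread_mutex_unlock(&pool.lock);
	return n ? (void *)n : malloc(size);
}

/* Cache the buffer unless the pool is already over its threshold. */
static void pool_free(void *p)
{
	struct node *n = p;
	int do_free = 0;

	pthread_mutex_lock(&pool.lock);
	if (pool.count > pool.thresh) {
		do_free = 1;
	} else {
		n->next = pool.list;
		pool.list = n;
		pool.count++;
	}
	pthread_mutex_unlock(&pool.lock);
	if (do_free)
		free(p);
}

/* Models btrfs_compr_pool_scan(): detach the list under the lock, then
 * free everything outside it. */
static int pool_shrink(void)
{
	struct node *drain, *next;
	int freed;

	pthread_mutex_lock(&pool.lock);
	drain = pool.list;
	freed = pool.count;
	pool.list = NULL;
	pool.count = 0;
	pthread_mutex_unlock(&pool.lock);
	for (; drain; drain = next) {
		next = drain->next;
		free(drain);
	}
	return freed;
}

int main(void)
{
	void *a = pool_alloc(4096);
	void *b = pool_alloc(4096);

	pool_free(a);
	pool_free(b);
	printf("drained %d cached buffers\n", pool_shrink());
	return 0;
}

Bounding the list by a threshold keeps the steady-state cache small, while the drain-everything shrink path mirrors the "just simply drain the whole list" behavior of btrfs_compr_pool_scan() above.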
@@ -901,11 +1023,9 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, - u64 start, struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); @@ -914,8 +1034,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); - ret = compression_compress_pages(type, workspace, mapping, start, pages, - out_pages, total_in, total_out); + ret = compression_compress_pages(type, workspace, mapping, start, folios, + out_folios, total_in, total_out); put_workspace(type, workspace); return ret; } @@ -940,10 +1060,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. * start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); struct list_head *workspace; const u32 sectorsize = fs_info->sectorsize; int ret; @@ -956,7 +1076,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); workspace = get_workspace(type, 0); - ret = compression_decompress(type, workspace, data_in, dest_page, + ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); put_workspace(type, workspace); @@ -969,15 +1089,36 @@ int __init btrfs_init_compress(void) offsetof(struct compressed_bio, bbio.bio), BIOSET_NEED_BVECS)) return -ENOMEM; + + compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages"); + if (!compr_pool.shrinker) + return -ENOMEM; + btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); zstd_init_workspace_manager(); + + spin_lock_init(&compr_pool.lock); + INIT_LIST_HEAD(&compr_pool.list); + compr_pool.count = 0; + /* 128K / 4K = 32, for 8 threads is 256 pages. */ + compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8; + compr_pool.shrinker->count_objects = btrfs_compr_pool_count; + compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan; + compr_pool.shrinker->batch = 32; + compr_pool.shrinker->seeks = DEFAULT_SEEKS; + shrinker_register(compr_pool.shrinker); + return 0; } void __cold btrfs_exit_compress(void) { + /* For now scan drains all pages and does not touch the parameters. 
*/ + btrfs_compr_pool_scan(NULL, NULL); + shrinker_free(compr_pool.shrinker); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); @@ -1362,11 +1503,6 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, /* * Compression heuristic. * - * For now is's a naive and optimistic 'return true', we'll extend the logic to - * quickly (compared to direct compression) detect data characteristics - * (compressible/incompressible) to avoid wasting CPU time on incompressible - * data. - * * The following types of analysis can be performed: * - detect mostly zero data * - detect data with low "byte set" size (text, etc) @@ -1374,7 +1510,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, * * Return non-zero if the compression should be done, 0 otherwise. */ -int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) +int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) { struct list_head *ws_list = get_workspace(0, 0); struct heuristic_ws *ws; @@ -1384,7 +1520,7 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) ws = list_entry(ws_list, struct heuristic_ws, list); - heuristic_collect_sample(inode, start, end, ws); + heuristic_collect_sample(&inode->vfs_inode, start, end, ws); if (sample_repeated_patterns(ws)) { ret = 1; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 609865c94065..b6563b6a333e 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -7,10 +7,18 @@ #define BTRFS_COMPRESSION_H #include <linux/sizes.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/wait.h> #include "bio.h" +struct address_space; +struct page; +struct inode; struct btrfs_inode; struct btrfs_ordered_extent; +struct btrfs_bio; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -33,11 +41,11 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of compressed pages in the array */ - unsigned int nr_pages; + /* Number of compressed folios in the array. */ + unsigned int nr_folios; - /* the pages with the compressed data on them */ - struct page **compressed_pages; + /* The folios with the compressed data on them. */ + struct folio **compressed_folios; /* starting offset in the inode for our pages */ u64 start; @@ -74,28 +82,36 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } +/* @range_end must be exclusive. 
*/ +static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +{ + u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + + return min(range_end, page_end) - cur; +} + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, - u64 start, struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out); -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out); +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct page **compressed_pages, - unsigned int nr_pages, - blk_opf_t write_flags, - bool writeback); + struct folio **compressed_folios, + unsigned int nr_folios, blk_opf_t write_flags, + bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +struct folio *btrfs_alloc_compr_folio(void); +void btrfs_free_compr_folio(struct folio *folio); + enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, @@ -136,35 +152,38 @@ extern const struct btrfs_compress_op btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); bool btrfs_compress_is_valid_type(const char *str, size_t len); -int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end); + +int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, + struct folio **in_folio_ret); -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); struct list_head *zlib_get_workspace(unsigned int level); -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); 
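btrfs_calc_input_length(), added at the top of this header hunk, clamps each compression step to whichever comes first: the next page boundary after cur, or the exclusive end of the range. A tiny standalone model of the arithmetic follows (assuming a 4 KiB page size; calc_input_length is an illustrative stand-in, not the kernel symbol).

#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u	/* assumed page size for the demo */

/* Bytes available from cur before hitting the next page boundary or
 * the exclusive range end, mirroring btrfs_calc_input_length(). */
static uint32_t calc_input_length(uint64_t range_end, uint64_t cur)
{
	uint64_t page_end = (cur / MODEL_PAGE_SIZE) * MODEL_PAGE_SIZE +
			    MODEL_PAGE_SIZE;
	uint64_t end = range_end < page_end ? range_end : page_end;

	return (uint32_t)(end - cur);
}

int main(void)
{
	/* Range extends past the page: clamp to the page end (8192). */
	printf("%u\n", calc_input_length(10000, 5000)); /* prints 3192 */
	/* Range ends inside the same page: clamp to range_end. */
	printf("%u\n", calc_input_length(5100, 5000));  /* prints 100 */
	return 0;
}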
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 25c902e7556d..3ba15d9c3e88 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p) * cause could be a bug, eg. due to ENOSPC, and not for common errors that are * caused by external factors. */ -bool __cold abort_should_print_stack(int errno) +bool __cold abort_should_print_stack(int error) { - switch (errno) { + switch (error) { case -EIO: case -EROFS: case -ENOMEM: @@ -291,7 +291,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ - if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) + if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID) list_move_tail(&root->dirty_list, &fs_info->dirty_cowonly_roots); else @@ -316,11 +316,12 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int ret = 0; int level; struct btrfs_disk_key disk_key; + u64 reloc_src_root = 0; WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - trans->transid != root->last_trans); + trans->transid != btrfs_get_root_last_trans(root)); level = btrfs_header_level(buf); if (level == 0) @@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + reloc_src_root = btrfs_header_owner(buf); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, &disk_key, level, buf->start, 0, - BTRFS_NESTING_NEW_ROOT); + reloc_src_root, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -367,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) { + const u64 buf_gen = btrfs_header_generation(buf); + /* * Tree blocks not in shareable trees and tree roots are never shared. * If a block was allocated after the last snapshot and the block was * not allocated by tree relocation, we know the block is not shared. 
*/ - if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - buf != root->node && - (btrfs_header_generation(buf) <= - btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) { - if (buf != root->commit_root) - return 1; - /* - * An extent buffer that used to be the commit root may still be - * shared because the tree height may have increased and it - * became a child of a higher level root. This can happen when - * snapshotting a subvolume created in the current transaction. - */ - if (btrfs_header_generation(buf) == trans->transid) - return 1; - } - return 0; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + return false; + + if (buf == root->node) + return false; + + if (buf_gen > btrfs_root_last_snapshot(&root->root_item) && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) + return false; + + if (buf != root->commit_root) + return true; + + /* + * An extent buffer that used to be the commit root may still be shared + * because the tree height may have increased and it became a child of a + * higher level root. This can happen when snapshotting a subvolume + * created in the current transaction. + */ + if (buf_gen == trans->transid) + return true; + + return false; } static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, @@ -406,7 +417,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, u64 refs; u64 owner; u64 flags; - u64 new_flags = 0; int ret; /* @@ -429,7 +439,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(trans, root, buf)) { ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, - &refs, &flags); + &refs, &flags, NULL); if (ret) return ret; if (unlikely(refs == 0)) { @@ -443,7 +453,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } } else { refs = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; else @@ -463,15 +473,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } if (refs > 1) { - if ((owner == root->root_key.objectid || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + if ((owner == btrfs_root_id(root) || + btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1); if (ret) return ret; - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0); if (ret) return ret; @@ -479,26 +488,22 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + ret = btrfs_set_disk_extent_flags(trans, buf, + BTRFS_BLOCK_FLAG_FULL_BACKREF); + if (ret) + return ret; } else { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; } - if (new_flags != 0) { - ret = btrfs_set_disk_extent_flags(trans, buf, new_flags); - if (ret) - return ret; - } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, 
cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); @@ -526,13 +531,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. */ -static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size, - enum btrfs_lock_nesting nest) +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -541,6 +546,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, int last_ref = 0; int unlock_orig = 0; u64 parent_start = 0; + u64 reloc_src_root = 0; if (*cow_ret == buf) unlock_orig = 1; @@ -550,7 +556,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - trans->transid != root->last_trans); + trans->transid != btrfs_get_root_last_trans(root)); level = btrfs_header_level(buf); @@ -559,12 +565,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) - parent_start = parent->start; - + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + reloc_src_root = btrfs_header_owner(buf); + } cow = btrfs_alloc_tree_block(trans, root, parent_start, - root->root_key.objectid, &disk_key, level, - search_start, empty_size, nest); + btrfs_root_id(root), &disk_key, level, + search_start, empty_size, reloc_src_root, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -576,43 +584,37 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | BTRFS_HEADER_FLAG_RELOC); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); else - btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_set_header_owner(cow, btrfs_root_id(root)); write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } } if (buf == root->node) { WARN_ON(parent && parent != buf); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); if (ret < 0) 
{ - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } atomic_inc(&cow->refs); rcu_assign_pointer(root->node, cow); @@ -622,20 +624,16 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, free_extent_buffer(buf); add_root_to_dirty_list(root); if (ret < 0) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, BTRFS_MOD_LOG_KEY_REPLACE); if (ret) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } btrfs_set_node_blockptr(parent, parent_slot, cow->start); @@ -645,27 +643,30 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } } ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, parent_start, last_ref); if (ret < 0) { - btrfs_tree_unlock(cow); - free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); - return ret; + goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; + +error_unlock_cow: + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + return ret; } static inline int should_cow_block(struct btrfs_trans_handle *trans, @@ -691,7 +692,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; @@ -699,11 +700,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, } /* - * cows a single block, see __btrfs_cow_block for the real work. + * COWs a single block, see btrfs_force_cow_block() for the real work. * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ -noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, +int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, @@ -711,7 +712,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -743,7 +743,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } - search_start = buf->start & ~((u64)SZ_1G - 1); + search_start = round_down(buf->start, SZ_1G); /* * Before CoWing this block for later modification, check if it's * Also we don't care about the error, as it's handled internally. 
*/ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* - * helper function for defrag to decide if two blocks pointed to by a - * node are actually close by - */ -static int close_blocks(u64 blocknr, u64 other, u32 blocksize) -{ - if (blocknr < other && other - (blocknr + blocksize) < 32768) - return 1; - if (blocknr > other && blocknr - (other + blocksize) < 32768) - return 1; - return 0; -} - -#ifdef __LITTLE_ENDIAN - -/* - * Compare two keys, on little-endian the disk order is same as CPU order and - * we can avoid the conversion. - */ -static int comp_keys(const struct btrfs_disk_key *disk_key, - const struct btrfs_key *k2) -{ - const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; - - return btrfs_comp_cpu_keys(k1, k2); -} - -#else - -/* - * compare two keys in a memcmp fashion - */ -static int comp_keys(const struct btrfs_disk_key *disk, - const struct btrfs_key *k2) -{ - struct btrfs_key k1; - - btrfs_disk_key_to_cpu(&k1, disk); - - return btrfs_comp_cpu_keys(&k1, k2); -} -#endif - -/* * same as comp_keys only with two btrfs_key's */ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) @@ -825,105 +778,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke } /* - * this is used by the defrag code to go through all the - * leaves pointed to by a node and reallocate them so that - * disk order is close to key order - */ -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_buffer *cur; - u64 blocknr; - u64 search_start = *last_ret; - u64 last_block = 0; - u64 other; - u32 parent_nritems; - int end_slot; - int i; - int err = 0; - u32 blocksize; - int progress_passed = 0; - struct btrfs_disk_key disk_key; - - /* - * COWing must happen through a running transaction, which always - * matches the current fs generation (it's a transaction with a state - * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs - * into error state to prevent the commit of any transaction. 
- */ - if (unlikely(trans->transaction != fs_info->running_transaction || - trans->transid != fs_info->generation)) { - btrfs_abort_transaction(trans, -EUCLEAN); - btrfs_crit(fs_info, -"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", - parent->start, btrfs_root_id(root), trans->transid, - fs_info->running_transaction->transid, - fs_info->generation); - return -EUCLEAN; - } - - parent_nritems = btrfs_header_nritems(parent); - blocksize = fs_info->nodesize; - end_slot = parent_nritems - 1; - - if (parent_nritems <= 1) - return 0; - - for (i = start_slot; i <= end_slot; i++) { - int close = 1; - - btrfs_node_key(parent, &disk_key, i); - if (!progress_passed && comp_keys(&disk_key, progress) < 0) - continue; - - progress_passed = 1; - blocknr = btrfs_node_blockptr(parent, i); - if (last_block == 0) - last_block = blocknr; - - if (i > 0) { - other = btrfs_node_blockptr(parent, i - 1); - close = close_blocks(blocknr, other, blocksize); - } - if (!close && i < end_slot) { - other = btrfs_node_blockptr(parent, i + 1); - close = close_blocks(blocknr, other, blocksize); - } - if (close) { - last_block = blocknr; - continue; - } - - cur = btrfs_read_node_slot(parent, i); - if (IS_ERR(cur)) - return PTR_ERR(cur); - if (search_start == 0) - search_start = last_block; - - btrfs_tree_lock(cur); - err = __btrfs_cow_block(trans, root, cur, parent, i, - &cur, search_start, - min(16 * blocksize, - (end_slot - i) * blocksize), - BTRFS_NESTING_COW); - if (err) { - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - break; - } - search_start = cur->start; - last_block = cur->start; - *last_ret = search_start; - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - } - return err; -} - -/* * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. 
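The search described above is a plain lower-bound binary search over the fixed-size key array of a btree node; the hunk that follows only changes how the keys are loaded (folios instead of pages). As a reference for the return convention (0 with *slot set on an exact match, 1 with *slot pointing at the first larger key otherwise), here is a hedged, self-contained sketch of the same algorithm over an ordinary array; the struct and comparator are simplified stand-ins for the kernel's btrfs_key and btrfs_comp_cpu_keys(), not code from this diff:

#include <stdint.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int comp_keys(const struct key *k1, const struct key *k2)
{
	if (k1->objectid != k2->objectid)
		return k1->objectid < k2->objectid ? -1 : 1;
	if (k1->type != k2->type)
		return k1->type < k2->type ? -1 : 1;
	if (k1->offset != k2->offset)
		return k1->offset < k2->offset ? -1 : 1;
	return 0;
}

/*
 * Returns 0 and sets *slot on an exact match, otherwise returns 1 with
 * *slot set to the first slot whose key sorts after @key (possibly
 * nritems, i.e. one past the end).
 */
static int bin_search(const struct key *keys, int first_slot, int nritems,
		      const struct key *key, int *slot)
{
	int low = first_slot;
	int high = nritems;

	while (low < high) {
		const int mid = low + (high - low) / 2;
		const int cmp = comp_keys(&keys[mid], key);

		if (cmp < 0)
			low = mid + 1;
		else if (cmp > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}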
@@ -968,7 +822,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, } while (low < high) { - unsigned long oip; + const int unit_size = eb->folio_size; + unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; struct btrfs_disk_key unaligned; @@ -976,20 +831,20 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, mid = (low + high) / 2; offset = p + mid * item_size; - oip = offset_in_page(offset); + oil = get_eb_offset_in_folio(eb, offset); - if (oip + key_size <= PAGE_SIZE) { - const unsigned long idx = get_eb_page_index(offset); - char *kaddr = page_address(eb->pages[idx]); + if (oil + key_size <= unit_size) { + const unsigned long idx = get_eb_folio_index(eb, offset); + char *kaddr = folio_address(eb->folios[idx]); - oip = get_eb_offset_in_page(eb, offset); - tmp = (struct btrfs_disk_key *)(kaddr + oip); + oil = get_eb_offset_in_folio(eb, offset); + tmp = (struct btrfs_disk_key *)(kaddr + oil); } else { read_extent_buffer(eb, &unaligned, offset, key_size); tmp = &unaligned; } - ret = comp_keys(tmp, key); + ret = btrfs_comp_keys(tmp, key); if (ret < 0) low = mid + 1; @@ -1004,19 +859,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, return 1; } -static void root_add_used(struct btrfs_root *root, u32 size) +static void root_add_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) + size); + btrfs_root_used(&root->root_item) + root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } -static void root_sub_used(struct btrfs_root *root, u32 size) +static void root_sub_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) - size); + btrfs_root_used(&root->root_item) - root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } @@ -1132,7 +987,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); - root_sub_used(root, mid->len); + root_sub_used_bytes(root); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); @@ -1154,7 +1009,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -1172,7 +1027,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, BTRFS_NESTING_RIGHT_COW); @@ -1206,9 +1061,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; goto out; } - root_sub_used(root, right->len); - ret = btrfs_free_tree_block(trans, btrfs_root_id(root), right, - 0, 1); + root_sub_used_bytes(root); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), + right, 0, 1); free_extent_buffer_stale(right); right = NULL; if (ret < 0) { @@ -1268,7 +1123,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = NULL; goto out; } - root_sub_used(root, mid->len); + root_sub_used_bytes(root); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; @@ -1364,7 +1219,7 @@ static noinline int 
push_nodes_for_insert(struct btrfs_trans_handle *trans, if (IS_ERR(left)) return PTR_ERR(left); - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -1424,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (IS_ERR(right)) return PTR_ERR(right); - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -1670,7 +1525,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, check.has_first_key = true; check.level = parent_level - 1; check.transid = gen; - check.owner_root = root->root_key.objectid; + check.owner_root = btrfs_root_id(root); /* * If we need to read an extent buffer from disk and we are holding locks @@ -1713,12 +1568,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); - return -EIO; - } - if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EUCLEAN; + return ret; } if (unlock_up) @@ -2038,7 +1888,7 @@ static int search_leaf(struct btrfs_trans_handle *trans, * the extent buffer's header and we have recently accessed * the header's level field. */ - ret = comp_keys(&first_key, key); + ret = btrfs_comp_keys(&first_key, key); if (ret < 0) { /* * The first key is smaller than the key we want @@ -2123,8 +1973,8 @@ static int search_leaf(struct btrfs_trans_handle *trans, } /* - * btrfs_search_slot - look for a key in a tree and perform necessary - * modifications to preserve tree invariants. + * Look for a key in a tree and perform necessary modifications to preserve + * tree invariants. * * @trans: Handle of transaction, used when modifying the tree * @p: Holds all btree nodes along the search path @@ -2551,7 +2401,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); - ret = comp_keys(&found_key, &orig_key); + ret = btrfs_comp_keys(&found_key, &orig_key); if (ret == 0) { if (path->slots[0] > 0) { path->slots[0]--; @@ -2566,7 +2416,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) } btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); + ret = btrfs_comp_keys(&found_key, &key); /* * We might have had an item with the previous key in the tree right * before we released our path. 
And after we released our path, that @@ -2715,8 +2565,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * */ static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, int level) + const struct btrfs_path *path, + const struct btrfs_disk_key *key, int level) { int i; struct extent_buffer *t; @@ -2745,7 +2595,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, * that the new key won't break the order */ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2757,7 +2607,7 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", @@ -2771,7 +2621,7 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", @@ -2811,8 +2661,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, * is correct, we only need to bother the last key of @left and the first * key of @right. */ -static bool check_sibling_keys(struct extent_buffer *left, - struct extent_buffer *right) +static bool check_sibling_keys(const struct extent_buffer *left, + const struct extent_buffer *right) { struct btrfs_key left_last; struct btrfs_key right_first; @@ -3012,7 +2862,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 lower_gen; struct extent_buffer *lower; struct extent_buffer *c; @@ -3029,13 +2878,13 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), &lower_key, level, root->node->start, 0, - BTRFS_NESTING_NEW_ROOT); + 0, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); btrfs_set_header_nritems(c, 1); btrfs_set_node_key(c, &lower_key, 0); @@ -3052,6 +2901,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (ret < 0) { int ret2; + btrfs_clear_buffer_dirty(trans, c); ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); if (ret2 < 0) btrfs_abort_transaction(trans, ret2); @@ -3080,8 +2930,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. 
*/ static int insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, + const struct btrfs_path *path, + const struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { struct extent_buffer *lower; @@ -3177,13 +3027,13 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), &disk_key, level, c->start, 0, - BTRFS_NESTING_SPLIT); + 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); ASSERT(btrfs_header_level(c) == level); ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); @@ -3435,7 +3285,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return PTR_ERR(right); - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); free_space = btrfs_leaf_free_space(right); if (free_space < data_size) @@ -3651,7 +3501,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return PTR_ERR(left); - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { @@ -3929,14 +3779,14 @@ again: * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just * use BTRFS_NESTING_NEW_ROOT. */ - right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, + right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), + &disk_key, 0, l->start, 0, 0, num_doubles ? BTRFS_NESTING_NEW_ROOT : BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); if (split == 0) { if (mid <= slot) { @@ -4171,7 +4021,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * the front. */ void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end) + const struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4263,7 +4113,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * make the item pointed to by the path bigger, data_size is the added size. */ void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size) + const struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; @@ -4448,6 +4298,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, /* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. 
+ * + * Returns: 0 on success + * -EEXIST if the first key already exists + * < 0 on other errors */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4616,7 +4470,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - root_sub_used(root, leaf->len); + root_sub_used_bytes(root); atomic_inc(&leaf->refs); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); @@ -5253,9 +5107,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root, int __init btrfs_ctree_init(void) { - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0); if (!btrfs_path_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f7bb4c34b984..2034d3710833 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,53 +6,26 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include <linux/mm.h> -#include <linux/sched/signal.h> -#include <linux/highmem.h> -#include <linux/fs.h> -#include <linux/rwsem.h> -#include <linux/semaphore.h> -#include <linux/completion.h> -#include <linux/backing-dev.h> -#include <linux/wait.h> -#include <linux/slab.h> -#include <trace/events/btrfs.h> -#include <asm/unaligned.h> +#include "linux/cleanup.h" #include <linux/pagemap.h> -#include <linux/btrfs.h> -#include <linux/btrfs_tree.h> -#include <linux/workqueue.h> -#include <linux/security.h> -#include <linux/sizes.h> -#include <linux/dynamic_debug.h> +#include <linux/spinlock.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> +#include <linux/wait.h> +#include <linux/list.h> +#include <linux/atomic.h> +#include <linux/xarray.h> #include <linux/refcount.h> -#include <linux/crc32c.h> -#include <linux/iomap.h> -#include <linux/fscrypt.h> -#include "extent-io-tree.h" -#include "extent_io.h" -#include "extent_map.h" -#include "async-thread.h" -#include "block-rsv.h" +#include <uapi/linux/btrfs_tree.h> #include "locking.h" -#include "misc.h" #include "fs.h" +#include "accessors.h" +#include "extent-io-tree.h" +struct extent_buffer; +struct btrfs_block_rsv; struct btrfs_trans_handle; -struct btrfs_transaction; -struct btrfs_pending_snapshot; -struct btrfs_delayed_ref_root; -struct btrfs_space_info; struct btrfs_block_group; -struct btrfs_ordered_sum; -struct btrfs_ref; -struct btrfs_bio; -struct btrfs_ioctl_encoded_io_args; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; -struct reloc_control; /* Read ahead values for struct btrfs_path.reada */ enum { @@ -112,6 +85,9 @@ struct btrfs_path { unsigned int nowait:1; }; +#define BTRFS_PATH_AUTO_FREE(path_name) \ + struct btrfs_path *path_name __free(btrfs_free_path) = NULL + /* * The state of btrfs root */ @@ -218,17 +194,27 @@ struct btrfs_root { atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; + /* + * Protected by the 'log_mutex' lock but can be read without holding + * that lock to avoid unnecessary lock contention, in which case it + * should be read using btrfs_get_root_log_transid() except if it's a + * log tree in which case it can be directly accessed. Updates to this + * field should always use btrfs_set_root_log_transid(), except for log + * trees where the field can be updated directly. + */ int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; - /* Just be updated when the commit succeeds. 
*/ + /* + * Only updated when the commit succeeds. Use + * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit() + * to access this field. + */ int last_log_commit; pid_t log_start_pid; u64 last_trans; - u32 type; - u64 free_objectid; struct btrfs_key defrag_progress; @@ -239,18 +225,17 @@ struct btrfs_root { struct list_head root_list; - spinlock_t log_extents_lock[2]; - struct list_head logged_list[2]; - - spinlock_t inode_lock; - /* red-black tree that keeps track of in-memory inodes */ - struct rb_root inode_tree; + /* + * Xarray that keeps track of in-memory inodes, protected by the lock + * @inode_lock. + */ + struct xarray inodes; /* - * radix tree that keeps track of delayed nodes of every inode, - * protected by inode_lock + * Xarray that keeps track of delayed nodes of every inode, protected + * by @inode_lock. */ - struct radix_tree_root delayed_nodes_tree; + struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -326,6 +311,9 @@ struct btrfs_root { /* Used only by log trees, when logging csum items */ struct extent_io_tree log_csum_range; + /* Used in simple quotas, track root during relocation. */ + u64 relocation_src_root; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -352,6 +340,55 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root) return root->root_key.objectid; } +static inline int btrfs_get_root_log_transid(const struct btrfs_root *root) +{ + return READ_ONCE(root->log_transid); +} + +static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid) +{ + WRITE_ONCE(root->log_transid, log_transid); +} + +static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root) +{ + return READ_ONCE(root->last_log_commit); +} + +static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id) +{ + WRITE_ONCE(root->last_log_commit, commit_id); +} + +static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root) +{ + return READ_ONCE(root->last_trans); +} + +static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid) +{ + WRITE_ONCE(root->last_trans, transid); +} + +/* + * Return the generation this root started with. + * + * Every normal root is created with root->root_key.offset set to its + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has its last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. + */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. 
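The new btrfs_get_root_*/btrfs_set_root_* helpers above follow the kernel's standard annotated-lockless-access pattern: the writer still updates the field under its usual lock (log_mutex for the log fields), while readers that skip the lock go through READ_ONCE()/WRITE_ONCE() so the compiler cannot tear, fuse, or re-read the access, and KCSAN knows the data race is intentional. A minimal sketch of the pattern with illustrative names (not part of this diff):

#include <linux/compiler.h>
#include <linux/mutex.h>

struct example_root {
	struct mutex log_mutex;
	int log_transid;
};

/* Reader side: may run without log_mutex, hence READ_ONCE(). */
static inline int example_get_log_transid(const struct example_root *root)
{
	return READ_ONCE(root->log_transid);
}

/*
 * Writer side: the caller holds root->log_mutex; WRITE_ONCE() pairs with
 * the lockless readers above.
 */
static inline void example_set_log_transid(struct example_root *root, int transid)
{
	lockdep_assert_held(&root->log_mutex);
	WRITE_ONCE(root->log_transid, transid);
}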
@@ -472,37 +509,12 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) -{ - return crc32c(crc, address, length); -} - -static inline void btrfs_crc32c_final(u32 crc, u8 *result) -{ - put_unaligned_le32(~crc, result); -} - -static inline u64 btrfs_name_hash(const char *name, int len) -{ - return crc32c((u32)~1, name, len); -} - -/* - * Figure the key offset of an extended inode ref - */ -static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, - int len) -{ - return (u64) crc32c(parent_objectid, name, len); -} - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); @@ -515,13 +527,43 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); + +#ifdef __LITTLE_ENDIAN + +/* + * Compare two keys. On little-endian the disk order is the same as the CPU + * order and we can avoid the conversion. + */ +static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key, + const struct btrfs_key *k2) +{ + const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; + + return btrfs_comp_cpu_keys(k1, k2); +} + +#else + +/* Compare two keys in a memcmp fashion. 
*/ +static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk, + const struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} + +#endif + int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -538,19 +580,26 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest); +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf); +bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size); + const struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end); + const struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -571,13 +620,10 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int find_higher, int return_any); -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); +DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index e1475dfdf7a8..968dae953948 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include "ctree.h" #include "disk-io.h" -#include "print-tree.h" #include "transaction.h" #include "locking.h" #include "accessors.h" @@ -46,8 +45,8 @@ struct inode_defrag { u32 extent_thresh; }; -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) +static int compare_inode_defrag(const struct inode_defrag *defrag1, + const struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) return 1; @@ -62,16 +61,14 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, } /* - * Pop a record for an inode 
into the defrag tree. The lock must be held + * Insert a record for an inode into the defrag tree. The lock must be held * already. * * If you're inserting a record for an older transid than an existing record, * the transid already in the tree is lowered. - * - * If an existing record is found the defrag item you pass in is freed. */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) +static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct inode_defrag *entry; @@ -84,7 +81,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(defrag, entry); + ret = compare_inode_defrag(defrag, entry); if (ret < 0) p = &parent->rb_left; else if (ret > 0) @@ -108,7 +105,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return 0; @@ -120,35 +117,29 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) } /* - * Insert a defrag record for this inode if auto defrag is enabled. + * Insert a defrag record for this inode if auto defrag is enabled. No errors + * are returned as they're not considered fatal. */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; - u64 transid; int ret; - if (!__need_auto_defrag(fs_info)) - return 0; + if (!need_auto_defrag(fs_info)) + return; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = inode->root->last_trans; + return; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) - return -ENOMEM; + return; defrag->ino = btrfs_ino(inode); - defrag->transid = transid; - defrag->root = root->root_key.objectid; + defrag->transid = btrfs_get_root_last_trans(root); + defrag->root = btrfs_root_id(root); defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); @@ -158,14 +149,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * and then re-read this inode, this new inode doesn't have * IN_DEFRAG flag. In that case, we may find the existing defrag. 
*/ - ret = __btrfs_add_inode_defrag(inode, defrag); + ret = btrfs_insert_inode_defrag(inode, defrag); if (ret) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); - return 0; } /* @@ -190,7 +180,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(&tmp, entry); + ret = compare_inode_defrag(&tmp, entry); if (ret < 0) p = parent->rb_left; else if (ret > 0) @@ -199,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( goto out; } - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -215,27 +205,24 @@ out: void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { - struct inode_defrag *defrag; - struct rb_node *node; + struct inode_defrag *defrag, *next; spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); + + rbtree_postorder_for_each_entry_safe(defrag, next, + &fs_info->defrag_inodes, rb_node) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - cond_resched_lock(&fs_info->defrag_inodes_lock); + fs_info->defrag_inodes = RB_ROOT; - node = rb_first(&fs_info->defrag_inodes); - } spin_unlock(&fs_info->defrag_inodes_lock); } #define BTRFS_DEFRAG_BATCH 1024 -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) +static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag, + struct file_ra_state *ra) { struct btrfs_root *inode_root; struct inode *inode; @@ -246,7 +233,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, again: if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) goto cleanup; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) goto cleanup; /* Get the inode */ @@ -256,7 +243,7 @@ again: goto cleanup; } - inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); + inode = btrfs_iget(defrag->ino, inode_root); btrfs_put_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -274,9 +261,10 @@ again: range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; + file_ra_state_init(ra, inode->i_mapping); sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); iput(inode); @@ -303,11 +291,13 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while (1) { + struct file_ra_state ra = { 0 }; + /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) break; /* find an inode to defrag */ @@ -325,7 +315,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; root_objectid = defrag->root; - __btrfs_run_defrag_inode(fs_info, defrag); + btrfs_run_defrag_inode(fs_info, defrag, &ra); } atomic_dec(&fs_info->defrag_running); @@ -338,13 +328,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) } /* + * Check if two block addresses are close, used by defrag. 
+ */ +static bool close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < SZ_32K) + return true; + if (blocknr > other && blocknr - (other + blocksize) < SZ_32K) + return true; + return false; +} + +/* + * Go through all the leaves pointed to by a node and reallocate them so that + * disk order is close to key order. + */ +static int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *parent, + int start_slot, u64 *last_ret, + struct btrfs_key *progress) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + const u32 blocksize = fs_info->nodesize; + const int end_slot = btrfs_header_nritems(parent) - 1; + u64 search_start = *last_ret; + u64 last_block = 0; + int ret = 0; + bool progress_passed = false; + + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", + parent->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } + + if (btrfs_header_nritems(parent) <= 1) + return 0; + + for (int i = start_slot; i <= end_slot; i++) { + struct extent_buffer *cur; + struct btrfs_disk_key disk_key; + u64 blocknr; + u64 other; + bool close = true; + + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = true; + blocknr = btrfs_node_blockptr(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + + cur = btrfs_read_node_slot(parent, i); + if (IS_ERR(cur)) + return PTR_ERR(cur); + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + ret = btrfs_force_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + return ret; +} + +/* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to * better reflect disk order */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; @@ -461,6 +556,45 @@ done: } /* + * Defrag a given btree. Every leaf in the btree is read and defragmented. 
+ */ +int btrfs_defrag_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) + return 0; + + while (1) { + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + ret = btrfs_defrag_leaves(trans, root); + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + cond_resched(); + + if (btrfs_fs_closing(fs_info) || ret != -EAGAIN) + break; + + if (btrfs_defrag_cancelled(fs_info)) { + btrfs_debug(fs_info, "defrag_root cancelled"); + ret = -EAGAIN; + break; + } + } + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); + return ret; +} + +/* * Defrag specific helper to get an extent map. * * Differences between this and btrfs_get_extent() are: @@ -564,8 +698,10 @@ iterate: */ if (key.offset > start) { em->start = start; - em->orig_start = start; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; + em->disk_num_bytes = 0; + em->ram_bytes = 0; + em->offset = 0; em->len = key.offset - start; break; } @@ -627,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * We can get a merged extent, in that case, we need to re-search * tree to get the original em for defrag. * - * If @newer_than is 0 or em::generation < newer_than, we can trust - * this em, as either we don't care about the generation, or the - * merged extent map will be rejected anyway. + * This is because even if we have adjacent extents that are contiguous + * and compatible (same type and flags), we still want to defrag them + * so that we use less metadata (extent items in the extent tree and + * file extent items in the inode's subvolume tree). */ - if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && - newer_than && em->generation >= newer_than) { + if (em && (em->flags & EXTENT_FLAG_MERGED)) { free_extent_map(em); em = NULL; } @@ -658,7 +794,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, const struct extent_map *em) { - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (extent_map_is_compressed(em)) return BTRFS_MAX_COMPRESSED; return fs_info->max_extent_size; } @@ -666,7 +802,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *next; bool ret = false; @@ -682,9 +818,9 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, */ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); /* No more em or hole */ - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE) goto out; - if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + if (next->flags & EXTENT_FLAG_PREALLOC) goto out; /* * If the next extent is at its max capacity, defragging current extent @@ -717,20 +853,21 @@ out: * NOTE: Caller should also wait for page writeback after the cluster is * prepared, here we don't do writeback wait for each page. 
*/ -static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index) { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); u64 page_start = (u64)index << PAGE_SHIFT; u64 page_end = page_start + PAGE_SIZE - 1; struct extent_state *cached_state = NULL; - struct page *page; + struct folio *folio; int ret; again: - page = find_or_create_page(mapping, index, mask); - if (!page) - return ERR_PTR(-ENOMEM); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) + return folio; /* * Since we can defragment files opened read-only, we can encounter * executables that explicitly enable them, so this isn't very * restrictive. */ - if (PageCompound(page)) { - unlock_page(page); - put_page(page); + if (folio_test_large(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ETXTBSY); } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ERR_PTR(ret); } @@ -764,17 +901,17 @@ again: if (!ordered) break; - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); - lock_page(page); + folio_lock(folio); /* - * We unlocked the page above, so we need check if it was + * We unlocked the folio above, so we need to check if it was * released or not. */ - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } } @@ -783,21 +920,21 @@ again: * Now the page range has no ordered extent any more. Read the page to * make it uptodate. */ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } } - return page; + return folio; } struct defrag_target_range { @@ -848,14 +985,13 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * This is for users who want to convert inline extents to * regular ones through max_inline= mount option. */ - if (em->block_start == EXTENT_MAP_INLINE && + if (em->disk_bytenr == EXTENT_MAP_INLINE && em->len <= inode->root->fs_info->max_inline) goto next; - /* Skip hole/delalloc/preallocated extents */ - if (em->block_start == EXTENT_MAP_HOLE || - em->block_start == EXTENT_MAP_DELALLOC || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + /* Skip holes and preallocated extents. */ + if (em->disk_bytenr == EXTENT_MAP_HOLE || + (em->flags & EXTENT_FLAG_PREALLOC)) goto next; /* Skip older extent */ @@ -891,8 +1027,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). 
*/ - if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC, 0, NULL)) + if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC)) goto next; /* @@ -919,7 +1055,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * So if an inline extent passed all above checks, just add it * for defrag, and be converted to regular extents. */ - if (em->block_start == EXTENT_MAP_INLINE) + if (em->disk_bytenr == EXTENT_MAP_INLINE) goto add; next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, @@ -1019,7 +1155,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); */ static int defrag_one_locked_target(struct btrfs_inode *inode, struct defrag_target_range *target, - struct page **pages, int nr_pages, + struct folio **folios, int nr_pages, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1028,7 +1164,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, const u64 len = target->len; unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); + unsigned long first_index = folios[0]->index; int ret = 0; int i; @@ -1045,8 +1181,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); + folio_clear_checked(folios[i]); + btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1062,7 +1198,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); - struct page **pages; + struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; u64 last_index = (start + len - 1) >> PAGE_SHIFT; u64 start_index = start >> PAGE_SHIFT; @@ -1073,21 +1209,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) + folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS); + if (!folios) return -ENOMEM; /* Prepare all pages */ for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; + folios[i] = defrag_prepare_one_folio(inode, start_index + i); + if (IS_ERR(folios[i])) { + ret = PTR_ERR(folios[i]); + nr_pages = i; + goto free_folios; } } for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); + folio_wait_writeback(folios[i]); /* Lock the pages range */ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, @@ -1107,7 +1243,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, goto unlock_extent; list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + ret = defrag_one_locked_target(inode, entry, folios, nr_pages, &cached_state); if (ret < 0) break; @@ -1121,14 +1257,12 @@ unlock_extent: unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, &cached_state); -free_pages: +free_folios: for (i = 0; i < 
nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folios[i]); + folio_put(folios[i]); } - kfree(pages); + kfree(folios); return ret; } @@ -1174,8 +1308,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, if (entry->start + range_len <= *last_scanned_ret) continue; - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, + page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, ((entry->start + range_len - 1) >> PAGE_SHIFT) - (entry->start >> PAGE_SHIFT) + 1); @@ -1207,7 +1340,7 @@ out: * Entry point to file defragmentation. * * @inode: inode to be defragged - * @ra: readahead state (can be NUL) + * @ra: readahead state * @range: defrag options including range and flags * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode @@ -1223,18 +1356,19 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); - bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; + ASSERT(ra); + if (isize == 0) return 0; @@ -1264,18 +1398,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, last_byte = round_up(last_byte, fs_info->sectorsize) - 1; /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - - /* * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially. */ @@ -1329,8 +1451,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cond_resched(); } - if (ra_allocated) - kfree(ra); /* * Update range.start for autodefrag, this will indicate where to start * in next run. 
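btrfs_defrag_file() is also the backend of the defrag ioctl, and the btrfs_ioctl_defrag_range_args filled in by the autodefrag code above (range.start, range.len = (u64)-1, range.extent_thresh) is the same structure userspace passes in. A hedged userspace sketch of driving it through the existing BTRFS_IOC_DEFRAG_RANGE ioctl, with error handling kept minimal and the 256K threshold chosen only for illustration:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Ask the kernel to defragment the whole regular file at @path. */
static int defrag_whole_file(const char *path)
{
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	int fd = open(path, O_RDWR);

	if (fd < 0) {
		perror("open");
		return -1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* whole file, as autodefrag does */
	range.extent_thresh = 256 * 1024;	/* skip extents already >= 256K */

	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0) {
		perror("BTRFS_IOC_DEFRAG_RANGE");
		ret = -1;
	}
	close(fd);
	return ret;
}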
@@ -1369,9 +1489,7 @@ void __cold btrfs_auto_defrag_exit(void) int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct inode_defrag), 0, 0, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5305f2283b5e..6b7596c4f0dc 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -3,16 +3,25 @@ #ifndef BTRFS_DEFRAG_H #define BTRFS_DEFRAG_H +#include <linux/types.h> +#include <linux/compiler_types.h> + +struct inode; +struct file_ra_state; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ioctl_defrag_range_args; + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root); static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 4a7aefa5f9cf..7aa8a395d838 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -6,9 +6,7 @@ #include "block-rsv.h" #include "btrfs_inode.h" #include "space-info.h" -#include "transaction.h" #include "qgroup.h" -#include "block-group.h" #include "fs.h" /* @@ -113,7 +111,7 @@ * making error handling and cleanup easier. 
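Editor's note on the flag change above: by the time of this series SLAB_MEM_SPREAD had become a no-op (it only ever applied to the removed SLAB allocator), so these cache creations now pass 0 for flags. Where the cache name matches the structure name, later hunks in this diff go one step further and use the KMEM_CACHE() helper, which is roughly (paraphrasing include/linux/slab.h):

#define KMEM_CACHE(__struct, __flags)                                   \
        kmem_cache_create(#__struct, sizeof(struct __struct),           \
                          __alignof__(struct __struct), (__flags), NULL)

The "btrfs_inode_defrag" cache keeps the open-coded call because its cache name does not match "struct inode_defrag".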
*/ -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) +int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -350,7 +348,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, noflush); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + meta_reserve, flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index c5d573f2366e..3f32953c0a80 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -3,9 +3,13 @@ #ifndef BTRFS_DELALLOC_SPACE_H #define BTRFS_DELALLOC_SPACE_H +#include <linux/types.h> + struct extent_changeset; +struct btrfs_inode; +struct btrfs_fs_info; -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len, bool noflush); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 32c5f5a8a0e9..508bdbae29a0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -28,11 +28,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("btrfs_delayed_node", - sizeof(struct btrfs_delayed_node), - 0, - SLAB_MEM_SPREAD, - NULL); + delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0); if (!delayed_node_cache) return -ENOMEM; return 0; @@ -43,6 +39,17 @@ void __cold btrfs_delayed_inode_exit(void) kmem_cache_destroy(delayed_node_cache); } +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + static inline void btrfs_init_delayed_node( struct btrfs_delayed_node *delayed_node, struct btrfs_root *root, u64 inode_id) @@ -70,22 +77,22 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( return node; } - spin_lock(&root->inode_lock); - node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + xa_lock(&root->delayed_nodes); + node = xa_load(&root->delayed_nodes, ino); if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ BUG_ON(btrfs_inode->delayed_node != node); - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); return node; } /* * It's possible that we're racing into the middle of removing - * this node from the radix tree. In this case, the refcount + * this node from the xarray. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the radix at all; our release + * NULL like it was never in the xarray at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -93,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. 
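Editor's sketch of the lookup convention used in the delayed-inode conversion above, with a hypothetical cached object standing in for btrfs_delayed_node: the slot is only erased after the refcount reaches zero (see the release hunk further down), so a reader that cannot bump the refcount must treat the node as already gone rather than resurrect it.

#include <linux/xarray.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct cached_node {
        refcount_t refs;
        unsigned long id;
};

static struct cached_node *node_lookup(struct xarray *xa, unsigned long id)
{
        struct cached_node *node;

        xa_lock(xa);
        node = xa_load(xa, id);
        /* Refcount already zero: the release side won, pretend absent. */
        if (node && !refcount_inc_not_zero(&node->refs))
                node = NULL;
        xa_unlock(xa);
        return node;
}

static void node_put(struct xarray *xa, struct cached_node *node)
{
        if (refcount_dec_and_test(&node->refs)) {
                xa_erase(xa, node->id);
                kfree(node);
        }
}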
* - * If this node is properly in the radix, we want to bump the + * If this node is properly in the xarray, we want to bump the * refcount twice, once for the inode and once for this get * operation. */ @@ -104,10 +111,10 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = NULL; } - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); return node; } - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); return NULL; } @@ -120,6 +127,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(btrfs_inode); int ret; + void *ptr; again: node = btrfs_get_delayed_node(btrfs_inode); @@ -131,26 +139,30 @@ again: return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - /* cached in the btrfs inode and can be accessed */ + /* Cached in the inode and can be accessed. */ refcount_set(&node->refs, 2); - ret = radix_tree_preload(GFP_NOFS); - if (ret) { + /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */ + ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS); + if (ret == -ENOMEM) { kmem_cache_free(delayed_node_cache, node); - return ERR_PTR(ret); + return ERR_PTR(-ENOMEM); } - - spin_lock(&root->inode_lock); - ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); - if (ret == -EEXIST) { - spin_unlock(&root->inode_lock); + xa_lock(&root->delayed_nodes); + ptr = xa_load(&root->delayed_nodes, ino); + if (ptr) { + /* Somebody inserted it, go back and read it. */ + xa_unlock(&root->delayed_nodes); kmem_cache_free(delayed_node_cache, node); - radix_tree_preload_end(); + node = NULL; goto again; } + ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC); + ASSERT(xa_err(ptr) != -EINVAL); + ASSERT(xa_err(ptr) != -ENOMEM); + ASSERT(ptr == NULL); btrfs_inode->delayed_node = node; - spin_unlock(&root->inode_lock); - radix_tree_preload_end(); + xa_unlock(&root->delayed_nodes); return node; } @@ -263,15 +275,12 @@ static void __btrfs_release_delayed_node( if (refcount_dec_and_test(&delayed_node->refs)) { struct btrfs_root *root = delayed_node->root; - spin_lock(&root->inode_lock); + xa_erase(&root->delayed_nodes, delayed_node->inode_id); /* * Once our refcount goes to zero, nobody is allowed to bump it * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); - spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } } @@ -328,7 +337,8 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, } /* - * __btrfs_lookup_delayed_item - look up the delayed item by key + * Look up the delayed item by key. + * * @delayed_node: pointer to the delayed node * @index: the dir index value to lookup (offset of a dir index key) * @@ -515,7 +525,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, /* * For insertions we track reserved metadata space by accounting * for the number of leaves that will be used, based on the delayed - * node's index_items_size field. + * node's curr_index_batch_size and index_item_leaves fields. 
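Continuing the previous sketch, the insertion side mirrors the xa_reserve() pattern the hunk above uses in place of radix_tree_preload(): reserve the slot up front so the final store cannot fail with ENOMEM, then re-check under the lock for a racing inserter. Error handling is trimmed to the essentials:

#include <linux/err.h>

static struct cached_node *node_get_or_create(struct xarray *xa,
                                              unsigned long id)
{
        struct cached_node *node;

again:
        node = node_lookup(xa, id);
        if (node)
                return node;

        node = kzalloc(sizeof(*node), GFP_NOFS);
        if (!node)
                return ERR_PTR(-ENOMEM);
        node->id = id;
        refcount_set(&node->refs, 1);

        /* A reserved slot still reads back as NULL via xa_load(). */
        if (xa_reserve(xa, id, GFP_NOFS)) {
                kfree(node);
                return ERR_PTR(-ENOMEM);
        }

        xa_lock(xa);
        if (xa_load(xa, id)) {
                /* Lost the race: drop ours, go pick up the winner's. */
                xa_unlock(xa);
                kfree(node);
                goto again;
        }
        /* Storing into a reserved slot needs no allocation, cannot fail. */
        __xa_store(xa, id, node, GFP_ATOMIC);
        xa_unlock(xa);
        return node;
}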
*/ if (item->type == BTRFS_DELAYED_DELETION_ITEM) item->bytes_reserved = num_bytes; @@ -1033,14 +1043,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(leaf)) - goto search; -again: + /* + * Now we're going to delete the INODE_REF/EXTREF, which should be the + * only one ref left. Check if the next item is an INODE_REF/EXTREF. + * + * But if we're the last item already, release and search for the last + * INODE_REF/EXTREF. + */ + if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) { + key.objectid = node->inode_id; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = (u64)-1; + + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret > 0); + ASSERT(path->slots[0] > 0); + ret = 0; + path->slots[0]--; + leaf = path->nodes[0]; + } else { + path->slots[0]++; + } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != node->inode_id) goto out; - if (key.type != BTRFS_INODE_REF_KEY && key.type != BTRFS_INODE_EXTREF_KEY) goto out; @@ -1067,22 +1096,6 @@ err_out: btrfs_abort_transaction(trans, ret); return ret; - -search: - btrfs_release_path(path); - - key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = -1; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto err_out; - ASSERT(ret); - - ret = 0; - leaf = path->nodes[0]; - path->slots[0]--; - goto again; } static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -1379,8 +1392,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL, - NULL); + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL); async_work->nr = nr; btrfs_queue_work(fs_info->delayed_workers, &async_work->work); @@ -1457,7 +1469,7 @@ static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 flags, + const struct btrfs_disk_key *disk_key, u8 flags, u64 index) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1637,7 +1649,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, node->root->root_key.objectid, + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_delayed_item_release_metadata(dir->root, item); btrfs_release_delayed_item(item); @@ -1670,7 +1682,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) return 0; } -bool btrfs_readdir_get_delayed_items(struct inode *inode, +bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, u64 last_index, struct list_head *ins_list, struct list_head *del_list) @@ -1678,7 +1690,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; - delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return false; @@ -1686,8 +1698,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do 
one readdir with delayed items at a time because of * item->readdir_list. */ - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); - btrfs_inode_lock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1718,7 +1730,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, return true; } -void btrfs_readdir_put_delayed_items(struct inode *inode, +void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, struct list_head *ins_list, struct list_head *del_list) { @@ -1740,10 +1752,10 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, * The VFS is going to do up_read(), so we need to downgrade back to a * read lock. */ - downgrade_write(&inode->i_rwsem); + downgrade_write(&inode->vfs_inode.i_rwsem); } -int btrfs_should_delete_dir_index(struct list_head *del_list, +int btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index) { struct btrfs_delayed_item *curr; @@ -1761,11 +1773,10 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, } /* - * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree - * + * Read dir info stored in the delayed tree. */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list) + const struct list_head *ins_list) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; @@ -1835,24 +1846,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode_get_ctime(inode).tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode_get_ctime(inode).tv_nsec); + inode_get_ctime_nsec(inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, - BTRFS_I(inode)->i_otime.tv_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, - BTRFS_I(inode)->i_otime.tv_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); } int btrfs_fill_inode(struct inode *inode, u32 *rdev) @@ -1892,22 +1901,21 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + btrfs_stack_timespec_nsec(&inode_item->atime)); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); + inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + btrfs_stack_timespec_nsec(&inode_item->mtime)); inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime.tv_sec = - 
btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime.tv_nsec = - btrfs_stack_timespec_nsec(&inode_item->otime); + BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); inode->i_generation = BTRFS_I(inode)->generation; - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -1915,9 +1923,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) } int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_node *delayed_node; int ret = 0; @@ -2041,34 +2049,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - u64 inode_id = 0; + unsigned long index = 0; struct btrfs_delayed_node *delayed_nodes[8]; - int i, n; while (1) { - spin_lock(&root->inode_lock); - n = radix_tree_gang_lookup(&root->delayed_nodes_tree, - (void **)delayed_nodes, inode_id, - ARRAY_SIZE(delayed_nodes)); - if (!n) { - spin_unlock(&root->inode_lock); - break; + struct btrfs_delayed_node *node; + int count; + + xa_lock(&root->delayed_nodes); + if (xa_empty(&root->delayed_nodes)) { + xa_unlock(&root->delayed_nodes); + return; } - inode_id = delayed_nodes[n - 1]->inode_id + 1; - for (i = 0; i < n; i++) { + count = 0; + xa_for_each_start(&root->delayed_nodes, index, node, index) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) - delayed_nodes[i] = NULL; + if (refcount_inc_not_zero(&node->refs)) { + delayed_nodes[count] = node; + count++; + } + if (count >= ARRAY_SIZE(delayed_nodes)) + break; } - spin_unlock(&root->inode_lock); + xa_unlock(&root->delayed_nodes); + index++; - for (i = 0; i < n; i++) { - if (!delayed_nodes[i]) - continue; + for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 1da213197f55..7cfefdfe54ea 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -7,15 +7,23 @@ #ifndef BTRFS_DELAYED_INODE_H #define BTRFS_DELAYED_INODE_H +#include <linux/types.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> +#include <linux/fs.h> #include <linux/atomic.h> #include <linux/refcount.h> #include "ctree.h" +struct btrfs_disk_key; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_trans_handle; + enum btrfs_delayed_item_type { BTRFS_DELAYED_INSERTION_ITEM, BTRFS_DELAYED_DELETION_ITEM @@ -98,22 +106,11 @@ struct btrfs_delayed_item { char data[] __counted_by(data_len); }; -static inline void btrfs_init_delayed_root( - struct btrfs_delayed_root *delayed_root) -{ - atomic_set(&delayed_root->items, 0); - atomic_set(&delayed_root->items_seq, 0); - delayed_root->nodes = 0; - spin_lock_init(&delayed_root->lock); - init_waitqueue_head(&delayed_root->wait); - INIT_LIST_HEAD(&delayed_root->node_list); - INIT_LIST_HEAD(&delayed_root->prepare_list); -} - +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root); int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, 
int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 flags, + const struct btrfs_disk_key *disk_key, u8 flags, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, @@ -135,7 +132,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); @@ -147,17 +143,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info); /* Used for readdir() */ -bool btrfs_readdir_get_delayed_items(struct inode *inode, +bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, u64 last_index, struct list_head *ins_list, struct list_head *del_list); -void btrfs_readdir_put_delayed_items(struct inode *inode, +void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, struct list_head *ins_list, struct list_head *del_list); -int btrfs_should_delete_dir_index(struct list_head *del_list, +int btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list); + const struct list_head *ins_list); /* Used during directory logging. */ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6f2e48d697dd..cab94d141f66 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -16,8 +16,7 @@ #include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; -struct kmem_cache *btrfs_delayed_tree_ref_cachep; -struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_ref_node_cachep; struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees @@ -57,16 +56,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) * Release a ref head's reservation. * * @fs_info: the filesystem - * @nr: number of items to drop + * @nr_refs: number of delayed refs to drop + * @nr_csums: number of csum items to drop * * Drops the delayed ref head's count from the delayed refs rsv and free any * excess reservation we had. */ -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); - u64 released = 0; + u64 num_bytes; + u64 released; + + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs); + num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) @@ -77,68 +80,118 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) /* * Adjust the size of the delayed refs rsv. * - * This is to be called anytime we may have adjusted trans->delayed_ref_updates, - * it'll calculate the additional size and add it to the delayed_refs_rsv. + * This is to be called anytime we may have adjusted trans->delayed_ref_updates + * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and + * add it to the delayed_refs_rsv. 
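Editor's sketch: with the release helper above now split into ref and csum counts, a (hypothetical) cleanup path for a finished ref head releases both in one call, mirroring how the reservation was charged when the head was added:

static void release_head_rsv_example(struct btrfs_fs_info *fs_info,
                                     struct btrfs_delayed_ref_head *head)
{
        u64 nr_csums = 0;

        /*
         * Csum deletion space is only reserved for data drops, since the
         * csum items disappear when the extent's ref count hits zero.
         */
        if (head->is_data && head->ref_mod < 0)
                nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);

        /* One ref head, plus however many csum leaves it covered. */
        btrfs_delayed_refs_rsv_release(fs_info, 1, nr_csums);
}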
*/ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv; u64 num_bytes; + u64 reserved_bytes; - if (!trans->delayed_ref_updates) + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); + num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, + trans->delayed_ref_csum_deletions); + + if (num_bytes == 0) return; - num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, - trans->delayed_ref_updates); + /* + * Try to take num_bytes from the transaction's local delayed reserve. + * If not possible, try to take as much as it's available. If the local + * reserve doesn't have enough reserved space, the delayed refs reserve + * will be refilled next time btrfs_delayed_refs_rsv_refill() is called + * by someone or if a transaction commit is triggered before that, the + * global block reserve will be used. We want to minimize using the + * global block reserve for cases we can account for in advance, to + * avoid exhausting it and reach -ENOSPC during a transaction commit. + */ + spin_lock(&local_rsv->lock); + reserved_bytes = min(num_bytes, local_rsv->reserved); + local_rsv->reserved -= reserved_bytes; + local_rsv->full = (local_rsv->reserved >= local_rsv->size); + spin_unlock(&local_rsv->lock); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = false; + delayed_rsv->reserved += reserved_bytes; + delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size); spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; + trans->delayed_ref_csum_deletions = 0; } /* - * Transfer bytes to our delayed refs rsv. - * - * @fs_info: the filesystem - * @num_bytes: number of bytes to transfer - * - * This transfers up to the num_bytes amount, previously reserved, to the - * delayed_refs_rsv. Any extra bytes are returned to the space info. + * Adjust the size of the delayed refs block reserve for 1 block group item + * insertion, used after allocating a block group. */ -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - u64 num_bytes) +void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) { - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - u64 to_free = 0; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { - u64 delta = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - if (num_bytes > delta) { - to_free = num_bytes - delta; - num_bytes = delta; - } - } else { - to_free = num_bytes; - num_bytes = 0; - } + spin_lock(&delayed_rsv->lock); + /* + * Inserting a block group item does not require changing the free space + * tree, only the extent tree or the block group tree, so this is all we + * need. + */ + delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1); + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); +} - if (num_bytes) - delayed_refs_rsv->reserved += num_bytes; - if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = true; - spin_unlock(&delayed_refs_rsv->lock); +/* + * Adjust the size of the delayed refs block reserve to release space for 1 + * block group item insertion. 
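Stripped of the btrfs types, the new btrfs_update_delayed_refs_rsv() above is a size/reserved transfer between two reserves: grow the global one by the bytes needed, funding as much as possible from what the transaction-local reserve already holds. A standalone model (locking omitted, the struct is an assumption):

#include <stdint.h>

struct rsv {
        uint64_t size;          /* how much we should have reserved */
        uint64_t reserved;      /* how much we actually have */
        int full;
};

static void grow_from_local(struct rsv *local, struct rsv *global,
                            uint64_t needed)
{
        uint64_t moved = needed < local->reserved ? needed : local->reserved;

        local->reserved -= moved;
        local->full = (local->reserved >= local->size);

        global->size += needed;
        global->reserved += moved;
        global->full = (global->reserved >= global->size);
}

Any shortfall (needed - moved) is what a later btrfs_delayed_refs_rsv_refill(), or as a last resort the global block reserve, has to cover.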
+ */ +void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 released; + + released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/* + * Adjust the size of the delayed refs block reserve for 1 block group item + * update. + */ +void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + + spin_lock(&delayed_rsv->lock); + /* + * Updating a block group item does not result in new nodes/leaves and + * does not require changing the free space tree, only the extent tree + * or the block group tree, so this is all we need. + */ + delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1); + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); +} - if (num_bytes) +/* + * Adjust the size of the delayed refs block reserve to release space for 1 + * block group item update. + */ +void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1); + u64 released; + + released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); + if (released > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - if (to_free) - btrfs_space_info_free_bytes_may_use(fs_info, - delayed_refs_rsv->space_info, to_free); + 0, released, 0); } /* @@ -154,6 +207,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_space_info *space_info = block_rsv->space_info; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; u64 refilled_bytes; @@ -170,7 +224,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); if (ret) return ret; @@ -199,8 +253,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); if (to_free > 0) - btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info, - to_free); + btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, @@ -209,50 +262,19 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, } /* - * compare two delayed tree backrefs with same bytenr and type - */ -static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, - struct btrfs_delayed_tree_ref *ref2) -{ - if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } - return 0; -} - -/* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, - struct btrfs_delayed_data_ref *ref2) +static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, + struct btrfs_delayed_ref_node *ref2) { - if (ref1->node.type == 
BTRFS_EXTENT_DATA_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - if (ref1->objectid < ref2->objectid) - return -1; - if (ref1->objectid > ref2->objectid) - return 1; - if (ref1->offset < ref2->offset) - return -1; - if (ref1->offset > ref2->offset) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } + if (ref1->data_ref.objectid < ref2->data_ref.objectid) + return -1; + if (ref1->data_ref.objectid > ref2->data_ref.objectid) + return 1; + if (ref1->data_ref.offset < ref2->data_ref.offset) + return -1; + if (ref1->data_ref.offset > ref2->data_ref.offset) + return 1; return 0; } @@ -266,13 +288,20 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return -1; if (ref1->type > ref2->type) return 1; - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), - btrfs_delayed_node_to_tree_ref(ref2)); - else - ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), - btrfs_delayed_node_to_data_ref(ref2)); + if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } else { + if (ref1->ref_root < ref2->ref_root) + return -1; + if (ref1->ref_root > ref2->ref_root) + return 1; + if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) + ret = comp_data_refs(ref1, ref2); + } if (ret) return ret; if (check_seq) { @@ -422,7 +451,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -433,9 +463,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, list_del(&ref->add_list); btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } -static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -464,10 +496,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, mod = -next->ref_mod; } - drop_delayed_ref(delayed_refs, head, next); + drop_delayed_ref(fs_info, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(delayed_refs, head, ref); + drop_delayed_ref(fs_info, delayed_refs, head, ref); done = true; } else { /* @@ -505,7 +537,7 @@ again: ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(delayed_refs, head, ref, seq)) + if (merge_ref(fs_info, delayed_refs, head, ref, seq)) goto again; } } @@ -584,10 +616,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return true if the ref was merged into an existing one (and therefore can be * freed by the caller). 
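Editor's restatement of the unified comparator: with parent and ref_root lifted into the common node, one total order covers tree and data refs. A standalone model follows (the enum stands in for the btrfs key type constants; the kernel version additionally breaks ties on seq when asked to):

#include <stdint.h>

enum { TREE_BLOCK_REF, EXTENT_DATA_REF, SHARED_BLOCK_REF, SHARED_DATA_REF };

struct ref_key {
        int type;
        uint64_t parent;        /* shared refs order by parent block */
        uint64_t ref_root;      /* normal refs order by owning root */
        uint64_t objectid;      /* data refs also order by inode... */
        uint64_t offset;        /* ...and by file offset */
};

static int cmp_u64(uint64_t a, uint64_t b)
{
        return (a > b) - (a < b);
}

static int comp_refs_model(const struct ref_key *a, const struct ref_key *b)
{
        int ret = (a->type > b->type) - (a->type < b->type);

        if (ret)
                return ret;
        if (a->type == SHARED_BLOCK_REF || a->type == SHARED_DATA_REF)
                return cmp_u64(a->parent, b->parent);
        ret = cmp_u64(a->ref_root, b->ref_root);
        if (ret || a->type != EXTENT_DATA_REF)
                return ret;
        ret = cmp_u64(a->objectid, b->objectid);
        return ret ? ret : cmp_u64(a->offset, b->offset);
}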
*/ -static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, +static bool insert_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { + struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs; struct btrfs_delayed_ref_node *exist; int mod; @@ -598,6 +631,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); spin_unlock(&href->lock); + trans->delayed_ref_updates++; return false; } @@ -626,7 +660,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(root, href, exist); + drop_delayed_ref(trans->fs_info, root, href, exist); spin_unlock(&href->lock); return true; } @@ -647,6 +681,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, BUG_ON(existing->is_data != update->is_data); spin_lock(&existing->lock); + + /* + * When freeing an extent, we may not know the owning root when we + * first create the head_ref. However, some deref before the last deref + * will know it, so we just need to update the head_ref accordingly. + */ + if (!existing->owning_root) + existing->owning_root = update->owning_root; + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -656,6 +699,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * Set it again here */ existing->must_insert_reserved = update->must_insert_reserved; + existing->owning_root = update->owning_root; /* * update the num_bytes so we make sure the accounting @@ -695,6 +739,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. + * We reserve bytes for csum deletion when adding or updating a ref head + * see add_delayed_ref_head() for more details. */ if (existing->is_data) { u64 csum_leaves = @@ -703,11 +749,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves); } if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; - trans->delayed_ref_updates += csum_leaves; + trans->delayed_ref_csum_deletions += csum_leaves; } } @@ -715,18 +761,20 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, } static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, + struct btrfs_ref *generic_ref, struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, u64 ref_root, - u64 reserved, int action, bool is_data, - bool is_system) + u64 reserved) { int count_mod = 1; bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ - BUG_ON(!is_data && reserved); + BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved); - switch (action) { + switch (generic_ref->action) { + case BTRFS_ADD_DELAYED_REF: + /* count_mod is already set to 1. 
*/ + break; case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; break; @@ -755,12 +803,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, } refcount_set(&head_ref->refs, 1); - head_ref->bytenr = bytenr; - head_ref->num_bytes = num_bytes; + head_ref->bytenr = generic_ref->bytenr; + head_ref->num_bytes = generic_ref->num_bytes; head_ref->ref_mod = count_mod; + head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; - head_ref->is_data = is_data; - head_ref->is_system = is_system; + head_ref->owning_root = generic_ref->owning_root; + head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA); + head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); @@ -769,13 +819,19 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); + /* If not metadata set an impossible level to help debugging. */ + if (generic_ref->type == BTRFS_REF_METADATA) + head_ref->level = generic_ref->tree_ref.level; + else + head_ref->level = U8_MAX; + if (qrecord) { - if (ref_root && reserved) { + if (generic_ref->ref_root && reserved) { qrecord->data_rsv = reserved; - qrecord->data_rsv_refroot = ref_root; + qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = bytenr; - qrecord->num_bytes = num_bytes; + qrecord->bytenr = generic_ref->bytenr; + qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } } @@ -784,6 +840,8 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, * helper function to actually insert a head node into the rbtree. * this does all the dirty work in terms of maintaining the correct * overall modification count. + * + * Returns an error pointer in case of an error. */ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, @@ -791,6 +849,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; bool qrecord_inserted = false; @@ -799,14 +858,23 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, /* Record qgroup extent info if provided */ if (qrecord) { - if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord)) + int ret; + + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord); + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, + qrecord->bytenr >> fs_info->sectorsize_bits); + /* Caller responsible for freeing qrecord on error. 
*/ + if (ret < 0) + return ERR_PTR(ret); kfree(qrecord); - else + } else { qrecord_inserted = true; + } } - trace_add_delayed_ref_head(trans->fs_info, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); @@ -819,16 +887,20 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + /* + * We reserve the amount of bytes needed to delete csums when + * adding the ref head and not when adding individual drop refs + * since the csum items are deleted only after running the last + * delayed drop ref (the data extent's ref count drops to 0). + */ if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; - trans->delayed_ref_updates += - btrfs_csum_bytes_to_leaves(trans->fs_info, - head_ref->num_bytes); + trans->delayed_ref_csum_deletions += + btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; @@ -837,8 +909,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * init_delayed_ref_common - Initialize the structure which represents a - * modification to a an extent. + * Initialize the structure which represents a modification to a an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * @@ -863,90 +934,111 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, */ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 ref_root, - int action, u8 ref_type) + struct btrfs_ref *generic_ref) { + int action = generic_ref->action; u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; - if (is_fstree(ref_root)) + if (is_fstree(generic_ref->ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; + ref->bytenr = generic_ref->bytenr; + ref->num_bytes = generic_ref->num_bytes; ref->ref_mod = 1; ref->action = action; ref->seq = seq; - ref->type = ref_type; + ref->type = btrfs_ref_type(generic_ref); + ref->ref_root = generic_ref->ref_root; + ref->parent = generic_ref->parent; RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); + + if (generic_ref->type == BTRFS_REF_DATA) + ref->data_ref = generic_ref->data_ref; + else + ref->tree_ref = generic_ref->tree_ref; } -/* - * add a delayed tree ref. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. 
- */ -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op) +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, + bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: generic_ref->ref_root; +#endif + generic_ref->tree_ref.level = level; + generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + +} + +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, + u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: generic_ref->ref_root; +#endif + generic_ref->data_ref.objectid = ino; + generic_ref->data_ref.offset = offset; + generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; +} + +static int add_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + struct btrfs_delayed_extent_op *extent_op, + u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_head *new_head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; bool qrecord_inserted; - bool is_system; - bool merged; int action = generic_ref->action; - int level = generic_ref->tree_ref.level; - u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; - u64 parent = generic_ref->parent; - u8 ref_type; - - is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); + bool merged; + int ret; - ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); - ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); - if (!ref) + node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS); + if (!node) return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - return -ENOMEM; + ret = -ENOMEM; + goto free_node; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); - return -ENOMEM; + ret = -ENOMEM; + goto free_head_ref; + } + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, + generic_ref->bytenr >> fs_info->sectorsize_bits, + GFP_NOFS)) { + ret = -ENOMEM; + goto free_record; } } - if (parent) - ref_type = BTRFS_SHARED_BLOCK_REF_KEY; - else - ref_type = BTRFS_TREE_BLOCK_REF_KEY; - - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.owning_root, action, - ref_type); - ref->root = generic_ref->tree_ref.owning_root; - ref->parent = parent; - ref->level = level; - - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.owning_root, 0, action, - false, is_system); 
+ init_delayed_ref_common(fs_info, node, generic_ref); + init_delayed_ref_head(head_ref, generic_ref, record, reserved); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -956,10 +1048,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); + new_head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + if (IS_ERR(new_head_ref)) { + spin_unlock(&delayed_refs->lock); + ret = PTR_ERR(new_head_ref); + goto free_record; + } + head_ref = new_head_ref; - merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, node); spin_unlock(&delayed_refs->lock); /* @@ -968,16 +1066,36 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, */ btrfs_update_delayed_refs_rsv(trans); - trace_add_delayed_tree_ref(fs_info, &ref->node, ref, - action == BTRFS_ADD_DELAYED_EXTENT ? - BTRFS_ADD_DELAYED_REF : action); + if (generic_ref->type == BTRFS_REF_DATA) + trace_add_delayed_data_ref(trans->fs_info, node); + else + trace_add_delayed_tree_ref(trans->fs_info, node); if (merged) - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - btrfs_qgroup_trace_extent_post(trans, record); - + return btrfs_qgroup_trace_extent_post(trans, record); return 0; + +free_record: + kfree(record); +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_node: + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); + return ret; +} + +/* + * Add a delayed tree ref. This does all of the accounting required to make sure + * the delayed ref is eventually processed before this transaction commits. 
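Editor's sketch of a call site under the consolidated API: the caller fills a btrfs_ref with a designated initializer plus the type-specific init helper, instead of threading a dozen scalars through per-type functions. The wrapper itself is hypothetical:

static int queue_tree_ref_example(struct btrfs_trans_handle *trans,
                                  struct extent_buffer *buf, u64 ref_root)
{
        struct btrfs_ref ref = {
                .action = BTRFS_ADD_DELAYED_REF,
                .bytenr = buf->start,
                .num_bytes = buf->len,
                .owning_root = ref_root,
                /* .parent left 0: a normal backref, not a shared one. */
                .ref_root = ref_root,
        };

        /* Sets tree_ref.level, the ref type and the qgroup skip decision. */
        btrfs_init_tree_ref(&ref, btrfs_header_level(buf), 0, false);

        return btrfs_add_delayed_tree_ref(trans, &ref, NULL);
}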
+ */ +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + struct btrfs_delayed_extent_op *extent_op) +{ + ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); + return add_delayed_ref(trans, generic_ref, extent_op, 0); } /* @@ -987,114 +1105,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, u64 reserved) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_data_ref *ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_qgroup_extent_record *record = NULL; - bool qrecord_inserted; - int action = generic_ref->action; - bool merged; - u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; - u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.owning_root; - u64 owner = generic_ref->data_ref.ino; - u64 offset = generic_ref->data_ref.offset; - u8 ref_type; - - ASSERT(generic_ref->type == BTRFS_REF_DATA && action); - ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); - if (!ref) - return -ENOMEM; - - if (parent) - ref_type = BTRFS_SHARED_DATA_REF_KEY; - else - ref_type = BTRFS_EXTENT_DATA_REF_KEY; - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - ref_root, action, ref_type); - ref->root = ref_root; - ref->parent = parent; - ref->objectid = owner; - ref->offset = offset; - - - head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - return -ENOMEM; - } - - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { - record = kzalloc(sizeof(*record), GFP_NOFS); - if (!record) { - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - kmem_cache_free(btrfs_delayed_ref_head_cachep, - head_ref); - return -ENOMEM; - } - } - - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, - reserved, action, true, false); - head_ref->extent_op = NULL; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); - - merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); - spin_unlock(&delayed_refs->lock); - - /* - * Need to update the delayed_refs_rsv with any changes we may have - * made. - */ - btrfs_update_delayed_refs_rsv(trans); - - trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, - action == BTRFS_ADD_DELAYED_EXTENT ? 
- BTRFS_ADD_DELAYED_REF : action); - if (merged) - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - - - if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); - return 0; + ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action); + return add_delayed_ref(trans, generic_ref, NULL, reserved); } int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, + u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_head *head_ref_ret; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_ref generic_ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_UPDATE_DELAYED_HEAD, + .bytenr = bytenr, + .num_bytes = num_bytes, + .tree_ref.level = level, + }; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; - init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, false, false); + init_delayed_ref_head(head_ref, &generic_ref, NULL, 0); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, - NULL); - + head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL, + BTRFS_UPDATE_DELAYED_HEAD, NULL); spin_unlock(&delayed_refs->lock); + if (IS_ERR(head_ref_ret)) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return PTR_ERR(head_ref_ret); + } + /* * Need to update the delayed_refs_rsv with any changes we may have * made. @@ -1103,6 +1151,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return 0; } +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + if (refcount_dec_and_test(&ref->refs)) { + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); + kmem_cache_free(btrfs_delayed_ref_node_cachep, ref); + } +} + /* * This does a simple search for the head node for a given extent. Returns the * head node if found, or NULL if not. @@ -1115,41 +1171,91 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt return find_ref_head(delayed_refs, bytenr, false); } +static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) +{ + int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY; + + if (type < entry->type) + return -1; + if (type > entry->type) + return 1; + + if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (root < entry->ref_root) + return -1; + if (root > entry->ref_root) + return 1; + } else { + if (parent < entry->parent) + return -1; + if (parent > entry->parent) + return 1; + } + return 0; +} + +/* + * Check to see if a given root/parent reference is attached to the head. This + * only checks for BTRFS_ADD_DELAYED_REF references that match, as that + * indicates the reference exists for the given root or parent. This is for + * tree blocks only. + * + * @head: the head of the bytenr we're searching. + * @root: the root objectid of the reference if it is a normal reference. + * @parent: the parent if this is a shared backref. 
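Editor's sketch for the level parameter added to btrfs_add_delayed_extent_op() above, now that struct btrfs_delayed_extent_op no longer carries it (the head records the level instead). A hypothetical caller updating a block's flags:

static int set_block_flags_example(struct btrfs_trans_handle *trans,
                                   struct extent_buffer *buf, u64 flags)
{
        struct btrfs_delayed_extent_op *extent_op;
        int ret;

        extent_op = btrfs_alloc_delayed_extent_op();
        if (!extent_op)
                return -ENOMEM;

        extent_op->flags_to_set = flags;
        extent_op->update_flags = true;
        extent_op->update_key = false;

        /* The level now rides along with the bytenr/size pair. */
        ret = btrfs_add_delayed_extent_op(trans, buf->start, buf->len,
                                          btrfs_header_level(buf), extent_op);
        if (ret)
                btrfs_free_delayed_extent_op(extent_op);
        return ret;
}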
+ */ +bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, + u64 root, u64 parent) +{ + struct rb_node *node; + bool found = false; + + lockdep_assert_held(&head->mutex); + + spin_lock(&head->lock); + node = head->ref_tree.rb_root.rb_node; + while (node) { + struct btrfs_delayed_ref_node *entry; + int ret; + + entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + ret = find_comp(entry, root, parent); + if (ret < 0) { + node = node->rb_left; + } else if (ret > 0) { + node = node->rb_right; + } else { + /* + * We only want to count ADD actions, as drops mean the + * ref doesn't exist. + */ + if (entry->action == BTRFS_ADD_DELAYED_REF) + found = true; + break; + } + } + spin_unlock(&head->lock); + return found; +} + void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); - kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); - kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + kmem_cache_destroy(btrfs_delayed_ref_node_cachep); kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } int __init btrfs_delayed_ref_init(void) { - btrfs_delayed_ref_head_cachep = kmem_cache_create( - "btrfs_delayed_ref_head", - sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) goto fail; - btrfs_delayed_tree_ref_cachep = kmem_cache_create( - "btrfs_delayed_tree_ref", - sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_delayed_tree_ref_cachep) - goto fail; - - btrfs_delayed_data_ref_cachep = kmem_cache_create( - "btrfs_delayed_data_ref", - sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_delayed_data_ref_cachep) + btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); + if (!btrfs_delayed_ref_node_cachep) goto fail; - btrfs_delayed_extent_op_cachep = kmem_cache_create( - "btrfs_delayed_extent_op", - sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index fd9bf2b709c0..352921e76c74 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -6,13 +6,55 @@ #ifndef BTRFS_DELAYED_REF_H #define BTRFS_DELAYED_REF_H +#include <linux/types.h> #include <linux/refcount.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <uapi/linux/btrfs_tree.h> + +struct btrfs_trans_handle; +struct btrfs_fs_info; /* these are the possible values of struct btrfs_delayed_ref_node->action */ -#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ -#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ -#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ -#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +enum btrfs_delayed_ref_action { + /* Add one backref to the tree */ + BTRFS_ADD_DELAYED_REF = 1, + /* Delete one backref from the tree */ + BTRFS_DROP_DELAYED_REF, + /* Record a full extent allocation */ + BTRFS_ADD_DELAYED_EXTENT, + /* Not changing ref count on head ref */ + BTRFS_UPDATE_DELAYED_HEAD, +} __packed; + +struct btrfs_data_ref { + /* For EXTENT_DATA_REF */ + + /* Inode which refers to this data extent */ + u64 objectid; + + /* + * file_offset - extent_offset + * + * file_offset is the 
key.offset of the EXTENT_DATA key. + * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. + */ + u64 offset; +}; + +struct btrfs_tree_ref { + /* + * Level of this tree block. + * + * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. + */ + int level; + + /* For non-skinny metadata, no special member needed */ +}; struct btrfs_delayed_ref_node { struct rb_node ref_node; @@ -32,6 +74,15 @@ struct btrfs_delayed_ref_node { /* seq number to keep track of insertion order */ u64 seq; + /* The ref_root for this ref */ + u64 ref_root; + + /* + * The parent for this ref, if this isn't set the ref_root is the + * reference owner. + */ + u64 parent; + /* ref count on this data structure */ refcount_t refs; @@ -48,11 +99,15 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; + + union { + struct btrfs_tree_ref tree_ref; + struct btrfs_data_ref data_ref; + }; }; struct btrfs_delayed_extent_op { struct btrfs_disk_key key; - u8 level; bool update_key; bool update_flags; u64 flags_to_set; @@ -105,6 +160,21 @@ struct btrfs_delayed_ref_head { int ref_mod; /* + * The root that triggered the allocation when must_insert_reserved is + * set to true. + */ + u64 owning_root; + + /* + * Track reserved bytes when setting must_insert_reserved. On success + * or cleanup, we will need to free the reservation. + */ + u64 reserved_bytes; + + /* Tree block level, for metadata only. */ + u8 level; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is @@ -117,26 +187,12 @@ struct btrfs_delayed_ref_head { * the free has happened. */ bool must_insert_reserved; + bool is_data; bool is_system; bool processing; }; -struct btrfs_delayed_tree_ref { - struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; - int level; -}; - -struct btrfs_delayed_data_ref { - struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; - u64 objectid; - u64 offset; -}; - enum btrfs_delayed_ref_flags { /* Indicate that we are flushing delayed refs for the commit */ BTRFS_DELAYED_REFS_FLUSHING, @@ -146,8 +202,16 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; - /* dirty extent records */ - struct rb_root dirty_extent_root; + /* + * Track dirty extent records. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits. This is both to get a more + * dense index space (optimizes xarray structure) and because indexes in + * xarrays are of "unsigned long" type, meaning they are 32 bits wide on + * 32 bits platforms, limiting the extent range to 4G which is too low + * and makes it unusable (truncated index values) on 32 bits platforms. + */ + struct xarray dirty_extents; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -183,47 +247,11 @@ enum btrfs_ref_type { BTRFS_REF_DATA, BTRFS_REF_METADATA, BTRFS_REF_LAST, -}; - -struct btrfs_data_ref { - /* For EXTENT_DATA_REF */ - - /* Original root this data extent belongs to */ - u64 owning_root; - - /* Inode which refers to this data extent */ - u64 ino; - - /* - * file_offset - extent_offset - * - * file_offset is the key.offset of the EXTENT_DATA key. - * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. - */ - u64 offset; -}; - -struct btrfs_tree_ref { - /* - * Level of this tree block - * - * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. 
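The dirty_extents comment above is easy to check numerically: shifting the logical address right by sectorsize_bits keeps indexes for multi-gigabyte filesystems inside a 32-bit unsigned long. A standalone demo (4 KiB sectors assumed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned int sectorsize_bits = 12;        /* 4 KiB sectors */
        const uint64_t bytenr = 5ULL << 30;             /* 5 GiB address */
        const uint64_t index = bytenr >> sectorsize_bits;

        /*
         * 5 GiB overflows a 32-bit unsigned long, but the shifted index
         * does not, so 32-bit hosts can still use it as an xarray index.
         */
        printf("bytenr=%llu index=%llu fits32=%d\n",
               (unsigned long long)bytenr, (unsigned long long)index,
               index <= 0xffffffffULL);
        return 0;
}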
- */
- int level;
-
- /*
- * Root which owns this tree block.
- *
- * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
- */
- u64 owning_root;
-
- /* For non-skinny metadata, no special member needed */
-};
+} __packed;

 struct btrfs_ref {
 enum btrfs_ref_type type;
- int action;
+ enum btrfs_delayed_ref_action action;

 /*
 * Whether this extent should go through qgroup record.
@@ -238,7 +266,14 @@ struct btrfs_ref {
 u64 real_root;
 #endif
 u64 bytenr;
- u64 len;
+ u64 num_bytes;
+ u64 owning_root;
+
+ /*
+ * The root that owns this reference. Either this or ->parent will be
+ * set, depending on what type of reference this is.
+ */
+ u64 ref_root;

 /* Bytenr of the parent tree block */
 u64 parent;
@@ -249,8 +284,7 @@ };

 extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
-extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
-extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_ref_node_cachep;
 extern struct kmem_cache *btrfs_delayed_extent_op_cachep;

 int __init btrfs_delayed_ref_init(void);
@@ -277,51 +311,21 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in return num_bytes; }

-static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
- int action, u64 bytenr, u64 len, u64 parent)
+static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
+ int num_csum_items)
 {
- generic_ref->action = action;
- generic_ref->bytenr = bytenr;
- generic_ref->len = len;
- generic_ref->parent = parent;
+ /*
+ * Deleting csum items does not result in new nodes/leaves and does not
+ * require changing the free space tree, only the csum tree, so this is
+ * all we need.
+ */
+ return btrfs_calc_metadata_size(fs_info, num_csum_items);
 }

-static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root, u64 mod_root, bool skip_qgroup)
-{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* If @real_root not set, use @root as fallback */
- generic_ref->real_root = mod_root ?: root;
-#endif
- generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.owning_root = root;
- generic_ref->type = BTRFS_REF_METADATA;
- if (skip_qgroup || !(is_fstree(root) &&
- (!mod_root || is_fstree(mod_root))))
- generic_ref->skip_qgroup = true;
- else
- generic_ref->skip_qgroup = false;
-
-}
-
-static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset, u64 mod_root,
- bool skip_qgroup)
-{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* If @real_root not set, use @root as fallback */
- generic_ref->real_root = mod_root ?: ref_root;
-#endif
- generic_ref->data_ref.owning_root = ref_root;
- generic_ref->data_ref.ino = ino;
- generic_ref->data_ref.offset = offset;
- generic_ref->type = BTRFS_REF_DATA;
- if (skip_qgroup || !(is_fstree(ref_root) &&
- (!mod_root || is_fstree(mod_root))))
- generic_ref->skip_qgroup = true;
- else
- generic_ref->skip_qgroup = false;
-}
+void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
+ bool skip_qgroup);
+void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
+ u64 mod_root, bool skip_qgroup);

 static inline struct btrfs_delayed_extent_op *
 btrfs_alloc_delayed_extent_op(void)
@@ -336,25 +340,7 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) kmem_cache_free(btrfs_delayed_extent_op_cachep, op); }

-static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
-{
-
WARN_ON(refcount_read(&ref->refs) == 0); - if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); - switch (ref->type) { - case BTRFS_TREE_BLOCK_REF_KEY: - case BTRFS_SHARED_BLOCK_REF_KEY: - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - case BTRFS_SHARED_DATA_REF_KEY: - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - break; - default: - BUG(); - } - } -} +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref); static inline u64 btrfs_ref_head_to_space_flags( struct btrfs_delayed_ref_head *head_ref) @@ -379,7 +365,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, u64 reserved); int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, + u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op); void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, @@ -402,27 +388,51 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums); void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); +void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); +void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); +void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - u64 num_bytes); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); +bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, + u64 root, u64 parent); -/* - * helper functions to cast a node into its container - */ -static inline struct btrfs_delayed_tree_ref * -btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { - return container_of(node, struct btrfs_delayed_tree_ref, node); + if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + return node->data_ref.objectid; + return node->tree_ref.level; } -static inline struct btrfs_delayed_data_ref * -btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) { - return container_of(node, struct btrfs_delayed_data_ref, node); + if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + return node->data_ref.offset; + return 0; +} + +static inline u8 btrfs_ref_type(struct btrfs_ref *ref) +{ + ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA); + + if (ref->type == BTRFS_REF_DATA) { + if (ref->parent) + return BTRFS_SHARED_DATA_REF_KEY; + else + return BTRFS_EXTENT_DATA_REF_KEY; + } else { + if (ref->parent) + return BTRFS_SHARED_BLOCK_REF_KEY; + else + return BTRFS_TREE_BLOCK_REF_KEY; + } + + return 0; } #endif diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f77ef719a3b1..604399e59a3d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -11,13 +11,10 @@ #include 
<linux/math64.h> #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "async-thread.h" -#include "check-integrity.h" #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" @@ -247,6 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + struct file *bdev_file; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,12 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); - if (IS_ERR(bdev)) { + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev); + return PTR_ERR(bdev_file); } + bdev = file_bdev(bdev_file); if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -313,11 +312,11 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; + device->bdev_file = bdev_file; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_devices; ret = btrfs_get_dev_zone_info(device, false); @@ -334,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - blkdev_put(bdev, fs_info->bdev_holder); + fput(bdev_file); return ret; } @@ -549,8 +548,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, u64 physical) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 chunk_offset = cache->start; int num_extents, cur_extent; int i; @@ -566,9 +564,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, } spin_unlock(&cache->lock); - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - ASSERT(!IS_ERR(em)); - map = em->map_lookup; + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + ASSERT(!IS_ERR(map)); num_extents = 0; cur_extent = 0; @@ -582,7 +579,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, cur_extent = i; } - free_extent_map(em); + btrfs_free_chunk_map(map); if (num_extents > 1 && cur_extent < num_extents - 1) { /* @@ -688,7 +685,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); /* * Commit dev_replace state and reserve 1 item for it. 
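
The hunk above replaces blkdev_get_by_path() with the struct file based block device API. A condensed sketch of the open/cleanup pattern it adopts; the holder argument and error handling shape follow the code above, while the surrounding function is assumed:

/* Sketch of the bdev_file open/cleanup pattern (context assumed). */
struct file *bdev_file;
struct block_device *bdev;

bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
                                   fs_info->bdev_holder, NULL);
if (IS_ERR(bdev_file))
        return PTR_ERR(bdev_file);
bdev = file_bdev(bdev_file);

/* ... configure the device, keep bdev_file around for its lifetime ... */

/* Teardown is a plain fput() on the file, not blkdev_put(). */
fput(bdev_file);
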
@@ -828,25 +825,46 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - u64 start = 0; - int i; + struct rb_node *node; - write_lock(&em_tree->lock); - do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) - break; - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) + /* + * The chunk mutex must be held so that no new chunks can be created + * while we are updating existing chunks. This guarantees we don't miss + * any new chunk that gets created for a range that falls before the + * range of the last chunk we processed. + */ + lockdep_assert_held(&fs_info->chunk_mutex); + + write_lock(&fs_info->mapping_tree_lock); + node = rb_first_cached(&fs_info->mapping_tree); + while (node) { + struct rb_node *next = rb_next(node); + struct btrfs_chunk_map *map; + u64 next_start; + + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + next_start = map->start + map->chunk_len; + + for (int i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); - } while (start); - write_unlock(&em_tree->lock); + + if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) { + map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX); + if (!map) + break; + node = &map->rb_node; + /* + * Drop the lookup reference since we are holding the + * lock in write mode and no one can remove the chunk + * map from the tree and drop its tree reference. + */ + btrfs_free_chunk_map(map); + } else { + node = next; + } + } + write_unlock(&fs_info->mapping_tree_lock); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, @@ -886,7 +904,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); /* * We have to use this loop approach because at this point src_device @@ -1005,8 +1023,7 @@ error: btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) - btrfs_scratch_superblocks(fs_info, src_device->bdev, - src_device->name->str); + btrfs_scratch_superblocks(fs_info, src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 675082ccec89..23e480efe5e6 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -6,11 +6,15 @@ #ifndef BTRFS_DEV_REPLACE_H #define BTRFS_DEV_REPLACE_H +#include <linux/types.h> +#include <linux/compiler_types.h> + struct btrfs_ioctl_dev_replace_args; struct btrfs_fs_info; struct btrfs_trans_handle; struct btrfs_dev_replace; struct btrfs_block_group; +struct btrfs_device; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 7066414be6ee..1e8cd7c9472e 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -22,7 +22,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, + const struct btrfs_key *cpu_key, u32 data_size, const char *name, int name_len) @@ 
-108,7 +108,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index) + const struct btrfs_key *location, u8 type, u64 index) { int ret = 0; int ret2 = 0; @@ -379,7 +379,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * for a specific name. */ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, + const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; @@ -417,7 +417,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_dir_item *di) + const struct btrfs_dir_item *di) { struct extent_buffer *leaf; diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index aab4b7cc7fa0..5f6dfafc91f1 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,11 +3,21 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include <linux/types.h> +#include <linux/crc32c.h> + +struct fscrypt_str; +struct btrfs_fs_info; +struct btrfs_key; +struct btrfs_path; +struct btrfs_root; +struct btrfs_trans_handle; + int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index); + const struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, @@ -23,7 +33,7 @@ struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root, int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_dir_item *di); + const struct btrfs_dir_item *di); int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -35,8 +45,13 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, const char *name, u16 name_len, int mod); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, + const struct btrfs_path *path, const char *name, int name_len); +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} + #endif diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c new file mode 100644 index 000000000000..71984d7db839 --- /dev/null +++ b/fs/btrfs/direct-io.c @@ -0,0 +1,1079 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/fsverity.h> +#include <linux/iomap.h> +#include "ctree.h" +#include "delalloc-space.h" +#include "direct-io.h" +#include "extent-tree.h" +#include "file.h" +#include "fs.h" +#include "transaction.h" +#include "volumes.h" + +struct btrfs_dio_data { + ssize_t submitted; + struct extent_changeset *data_reserved; + struct btrfs_ordered_extent *ordered; + bool data_space_reserved; + bool nocow_done; +}; + +struct btrfs_dio_private { + /* Range of I/O */ + u64 file_offset; + u32 bytes; + + /* This must be last */ + struct btrfs_bio bbio; +}; + +static struct bio_set btrfs_dio_bioset; + +static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 
lockend, + struct extent_state **cached_state, + unsigned int iomap_flags) +{ + const bool writing = (iomap_flags & IOMAP_WRITE); + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + int ret = 0; + + /* Direct lock must be taken before the extent lock. */ + if (nowait) { + if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + return -EAGAIN; + } else { + lock_dio_extent(io_tree, lockstart, lockend, cached_state); + } + + while (1) { + if (nowait) { + if (!try_lock_extent(io_tree, lockstart, lockend, + cached_state)) { + ret = -EAGAIN; + break; + } + } else { + lock_extent(io_tree, lockstart, lockend, cached_state); + } + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure there's no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, + lockend - lockstart + 1); + + /* + * We need to make sure there are no buffered pages in this + * range either, we could have raced between the invalidate in + * generic_file_direct_write and locking the extent. The + * invalidate needs to happen so that reads after a write do not + * get stale data. + */ + if (!ordered && + (!writing || !filemap_range_has_page(inode->i_mapping, + lockstart, lockend))) + break; + + unlock_extent(io_tree, lockstart, lockend, cached_state); + + if (ordered) { + if (nowait) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + break; + } + /* + * If we are doing a DIO read and the ordered extent we + * found is for a buffered write, we can not wait for it + * to complete and retry, because if we do so we can + * deadlock with concurrent buffered writes on page + * locks. This happens only if our DIO read covers more + * than one extent map, if at this point has already + * created an ordered extent for a previous extent map + * and locked its range in the inode's io tree, and a + * concurrent write against that previous extent map's + * range and this range started (we unlock the ranges + * in the io tree only when the bios complete and + * buffered writes always lock pages before attempting + * to lock range in the io tree). + */ + if (writing || + test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) + btrfs_start_ordered_extent(ordered); + else + ret = nowait ? -EAGAIN : -ENOTBLK; + btrfs_put_ordered_extent(ordered); + } else { + /* + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readahead (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readahead wait for that + * ordered extent to complete while holding a lock on + * that page. + */ + ret = nowait ? 
-EAGAIN : -ENOTBLK; + } + + if (ret) + break; + + cond_resched(); + } + + if (ret) + unlock_dio_extent(io_tree, lockstart, lockend, cached_state); + return ret; +} + +static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, + const u64 start, + const struct btrfs_file_extent *file_extent, + const int type) +{ + struct extent_map *em = NULL; + struct btrfs_ordered_extent *ordered; + + if (type != BTRFS_ORDERED_NOCOW) { + em = btrfs_create_io_em(inode, start, file_extent, type); + if (IS_ERR(em)) + goto out; + } + + ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, + (1U << type) | + (1U << BTRFS_ORDERED_DIRECT)); + if (IS_ERR(ordered)) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_map_range(inode, start, + start + file_extent->num_bytes - 1, false); + } + em = ERR_CAST(ordered); + } else { + ASSERT(!dio_data->ordered); + dio_data->ordered = ordered; + } + out: + + return em; +} + +static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, + u64 start, u64 len) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_file_extent file_extent; + struct extent_map *em; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len); +again: + ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, + 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } + if (ret) + return ERR_PTR(ret); + + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = ins.offset; + file_extent.ram_bytes = ins.offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; + em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent, + BTRFS_ORDERED_REGULAR); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + if (IS_ERR(em)) + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, + 1); + + return em; +} + +static int btrfs_get_blocks_direct_write(struct extent_map **map, + struct inode *inode, + struct btrfs_dio_data *dio_data, + u64 start, u64 *lenp, + unsigned int iomap_flags) +{ + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_file_extent file_extent; + struct extent_map *em = *map; + int type; + u64 block_start; + struct btrfs_block_group *bg; + bool can_nocow = false; + bool space_reserved = false; + u64 len = *lenp; + u64 prev_len; + int ret = 0; + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. 
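
The comment above enumerates the two cases where no new extent is allocated; everything else must COW. A reduced sketch of just that classification, assuming the extent map for the range has already been looked up (the helper name and the -1 sentinel are illustrative):

/* Sketch: classify a DIO write against an existing extent map. */
static int dio_write_type(const struct btrfs_inode *inode,
                          const struct extent_map *em)
{
        if (em->flags & EXTENT_FLAG_PREALLOC)
                return BTRFS_ORDERED_PREALLOC;
        if ((inode->flags & BTRFS_INODE_NODATACOW) &&
            em->disk_bytenr != EXTENT_MAP_HOLE)
                return BTRFS_ORDERED_NOCOW;
        return -1;      /* no reuse possible: fall through to COW allocation */
}

Even then, as the code above shows, the NOCOW path is only taken if can_nocow_extent() confirms it and a nocow writer can be accounted against the block group.
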
+ * + */ + if ((em->flags & EXTENT_FLAG_PREALLOC) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->disk_bytenr != EXTENT_MAP_HOLE)) { + if (em->flags & EXTENT_FLAG_PREALLOC) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = extent_map_block_start(em) + (start - em->start); + + if (can_nocow_extent(inode, start, &len, + &file_extent, false, false) == 1) { + bg = btrfs_inc_nocow_writers(fs_info, block_start); + if (bg) + can_nocow = true; + } + } + + prev_len = len; + if (can_nocow) { + struct extent_map *em2; + + /* We can NOCOW, so only need to reserve metadata space. */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + nowait); + if (ret < 0) { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + btrfs_dec_nocow_writers(bg); + if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) + ret = -EAGAIN; + goto out; + } + space_reserved = true; + + em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, + &file_extent, type); + btrfs_dec_nocow_writers(bg); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em2; + em = em2; + } + + if (IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + + dio_data->nocow_done = true; + } else { + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + + if (nowait) { + ret = -EAGAIN; + goto out; + } + + /* + * If we could not allocate data space before locking the file + * range and we can't do a NOCOW write, then we have to fail. + */ + if (!dio_data->data_space_reserved) { + ret = -ENOSPC; + goto out; + } + + /* + * We have to COW and we have already reserved data space before, + * so now we reserve only metadata. + */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + false); + if (ret < 0) + goto out; + space_reserved = true; + + em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + *map = em; + len = min(len, em->len - (start - em->start)); + if (len < prev_len) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + prev_len - len, true); + } + + /* + * We have created our ordered extent, so we can now release our reservation + * for an outstanding extent. + */ + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); + + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. 
+ */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); +out: + if (ret && space_reserved) { + btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); + } + *lenp = len; + return ret; +} + +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct extent_map *em; + struct extent_state *cached_state = NULL; + struct btrfs_dio_data *dio_data = iter->private; + u64 lockstart, lockend; + const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; + u64 len = length; + const u64 data_alloc_len = length; + u32 unlock_bits = EXTENT_LOCKED; + + /* + * We could potentially fault if we have a buffer > PAGE_SIZE, and if + * we're NOWAIT we may submit a bio for a partial range and return + * EIOCBQUEUED, which would result in an errant short read. + * + * The best way to handle this would be to allow for partial completions + * of iocb's, so we could submit the partial bio, return and fault in + * the rest of the pages, and then submit the io for the rest of the + * range. However we don't have that currently, so simply return + * -EAGAIN at this point so that the normal path is used. + */ + if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) + return -EAGAIN; + + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ + if (!write) + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); + + lockstart = start; + lockend = start + len - 1; + + /* + * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't + * enough if we've written compressed pages to this area, so we need to + * flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk - the first flush only starts + * compression on the data, while keeping the pages locked, so by the + * time the second flush returns we know bios for the compressed pages + * were submitted and finished, and the pages no longer under writeback. + * + * If we have a NOWAIT request and we have any pages in the range that + * are locked, likely due to compression still in progress, we don't want + * to block on page locks. We also don't want to block on pages marked as + * dirty or under writeback (same as for the non-compression case). + * iomap_dio_rw() did the same check, but after that and before we got + * here, mmap'ed writes may have happened or buffered reads started + * (readpage() and readahead(), which lock pages), as we haven't locked + * the file range yet. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + if (flags & IOMAP_NOWAIT) { + if (filemap_range_needs_writeback(inode->i_mapping, + lockstart, lockend)) + return -EAGAIN; + } else { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } + } + + memset(dio_data, 0, sizeof(*dio_data)); + + /* + * We always try to allocate data space and must do it before locking + * the file range, to avoid deadlocks with concurrent writes to the same + * range if the range has several extents and the writes don't expand the + * current i_size (the inode lock is taken in shared mode). 
If we fail to + * allocate data space here we continue and later, after locking the + * file range, we fail with ENOSPC only if we figure out we can not do a + * NOCOW write. + */ + if (write && !(flags & IOMAP_NOWAIT)) { + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, data_alloc_len, false); + if (!ret) + dio_data->data_space_reserved = true; + else if (ret && !(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + goto err; + } + + /* + * If this errors out it's because we couldn't invalidate pagecache for + * this range and we need to fallback to buffered IO, or we are doing a + * NOWAIT read/write and we need to block. + */ + ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); + if (ret < 0) + goto err; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because that's what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { + free_extent_map(em); + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; + goto unlock_err; + } + + len = min(len, em->len - (start - em->start)); + + /* + * If we have a NOWAIT request and the range contains multiple extents + * (or a mix of extents and holes), then we return -EAGAIN to make the + * caller fallback to a context where it can do a blocking (without + * NOWAIT) request. This way we avoid doing partial IO and returning + * success to the caller, which is not optimal for writes and for reads + * it can result in unexpected behaviour for an application. + * + * When doing a read, because we use IOMAP_DIO_PARTIAL when calling + * iomap_dio_rw(), we can end up returning less data then what the caller + * asked for, resulting in an unexpected, and incorrect, short read. + * That is, the caller asked to read N bytes and we return less than that, + * which is wrong unless we are crossing EOF. 
This happens if we get a + * page fault error when trying to fault in pages for the buffer that is + * associated to the struct iov_iter passed to iomap_dio_rw(), and we + * have previously submitted bios for other extents in the range, in + * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of + * those bios have completed by the time we get the page fault error, + * which we return back to our caller - we should only return EIOCBQUEUED + * after we have submitted bios for all the extents in the range. + */ + if ((flags & IOMAP_NOWAIT) && len < length) { + free_extent_map(em); + ret = -EAGAIN; + goto unlock_err; + } + + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, &len, flags); + if (ret < 0) + goto unlock_err; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); + if (dio_data->data_space_reserved) { + u64 release_offset; + u64 release_len = 0; + + if (dio_data->nocow_done) { + release_offset = start; + release_len = data_alloc_len; + } else if (len < data_alloc_len) { + release_offset = start + len; + release_len = data_alloc_len - len; + } + + if (release_len > 0) + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + release_offset, + release_len); + } + } + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. + */ + if ((em->disk_bytenr == EXTENT_MAP_HOLE) || + ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = extent_map_block_start(em) + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; + iomap->length = len; + free_extent_map(em); + + /* + * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, + * writes only hold it for this part. We hold the extent lock until + * we're completely done with the extent map to make sure it remains + * valid. + */ + if (write) + unlock_bits |= EXTENT_DIO_LOCKED; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); + + /* We didn't use everything, unlock the dio extent for the remainder. */ + if (!write && (start + len) < lockend) + unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); + + return 0; + +unlock_err: + /* + * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget + * to update this, be explicit that we expect EXTENT_LOCKED and + * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. 
+ */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); +err: + if (dio_data->data_space_reserved) { + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + start, data_alloc_len); + extent_changeset_free(dio_data->data_reserved); + } + + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_dio_data *dio_data = iter->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); + return 0; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + pos, length, false); + else + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); + ret = -ENOTBLK; + } + if (write) { + btrfs_put_ordered_extent(dio_data->ordered); + dio_data->ordered = NULL; + } + + if (write) + extent_changeset_free(dio_data->data_reserved); + return ret; +} + +static void btrfs_dio_end_io(struct btrfs_bio *bbio) +{ + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_inode *inode = bbio->inode; + struct bio *bio = &bbio->bio; + + if (bio->bi_status) { + btrfs_warn(inode->root->fs_info, + "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", + btrfs_ino(inode), bio->bi_opf, + dip->file_offset, dip->bytes, bio->bi_status); + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + btrfs_finish_ordered_extent(bbio->ordered, NULL, + dip->file_offset, dip->bytes, + !bio->bi_status); + } else { + unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); + } + + bbio->bio.bi_private = bbio->private; + iomap_dio_bio_end_io(bio); +} + +static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered) +{ + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_ordered_extent *new; + int ret; + + /* Must always be called for the beginning of an ordered extent. */ + if (WARN_ON_ONCE(start != ordered->disk_bytenr)) + return -EINVAL; + + /* No need to split if the ordered extent covers the entire bio. */ + if (ordered->disk_num_bytes == len) { + refcount_inc(&ordered->refs); + bbio->ordered = ordered; + return 0; + } + + /* + * Don't split the extent_map for NOCOW extents, as we're writing into + * a pre-existing one. 
+ */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); + if (ret) + return ret; + } + + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return PTR_ERR(new); + bbio->ordered = new; + return 0; +} + +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_dio_data *dio_data = iter->private; + + btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, + btrfs_dio_end_io, bio->bi_private); + bbio->inode = BTRFS_I(iter->inode); + bbio->file_offset = file_offset; + + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; + + dio_data->submitted += bio->bi_iter.bi_size; + + /* + * Check if we are doing a partial write. If we are, we need to split + * the ordered extent to match the submitted bio. Hang on to the + * remaining unfinishable ordered_extent in dio_data so that it can be + * cancelled in iomap_end to avoid a deadlock wherein faulting the + * remaining pages is blocked on the outstanding ordered extent. + */ + if (iter->flags & IOMAP_WRITE) { + int ret; + + ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); + if (ret) { + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + file_offset, dip->bytes, + !ret); + bio->bi_status = errno_to_blk_status(ret); + iomap_dio_bio_end_io(bio); + return; + } + } + + btrfs_submit_bbio(bbio, 0); +} + +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_dio_submit_io, + .bio_set = &btrfs_dio_bioset, +}; + +static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) +{ + struct btrfs_dio_data data = { 0 }; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +} + +static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) +{ + struct btrfs_dio_data data = { 0 }; + + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +} + +static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + const u32 blocksize_mask = fs_info->sectorsize - 1; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + +ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + loff_t pos; + ssize_t written = 0; + ssize_t written_buffered; + size_t prev_left = 0; + loff_t endbyte; + ssize_t ret; + unsigned int ilock_flags = 0; + struct iomap_dio *dio; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; + + /* + * If the write DIO is within EOF, use a shared lock and also only if + * security bits will likely not be dropped by file_remove_privs() called + * from btrfs_write_check(). Either will need to be rechecked after the + * lock was acquired. 
+ */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) + ilock_flags |= BTRFS_ILOCK_SHARED; + +relock: + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; + + /* Shared lock cannot be used with security bits set. */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + return ret; + } + + ret = btrfs_write_check(iocb, from, ret); + if (ret < 0) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto out; + } + + pos = iocb->ki_pos; + /* + * Re-check since file size may have changed just before taking the + * lock or pos may have changed because of O_APPEND in generic_write_check() + */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && + pos + iov_iter_count(from) > i_size_read(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + + if (check_direct_IO(fs_info, from, pos)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } + + /* + * The iov_iter can be mapped to the same file range we are writing to. + * If that's the case, then we will deadlock in the iomap code, because + * it first calls our callback btrfs_dio_iomap_begin(), which will create + * an ordered extent, and after that it will fault in the pages that the + * iov_iter refers to. During the fault in we end up in the readahead + * pages code (starting at btrfs_readahead()), which will lock the range, + * find that ordered extent and then wait for it to complete (at + * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since + * obviously the ordered extent can never complete as we didn't submit + * yet the respective bio(s). This always happens when the buffer is + * memory mapped to the same file range, since the iomap DIO code always + * invalidates pages in the target file range (after starting and waiting + * for any writeback). + * + * So here we disable page faults in the iov_iter and then retry if we + * got -EFAULT, faulting in the pages before the retry. + */ +again: + from->nofault = true; + dio = btrfs_dio_write(iocb, from, written); + from->nofault = false; + + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + } else { + /* + * If we have a synchronous write, we must make sure the fsync + * triggered by the iomap_dio_complete() call below doesn't + * deadlock on the inode lock - we are already holding it and we + * can't call it after unlocking because we may need to complete + * partial writes due to the input buffer (or parts of it) not + * being already faulted in. + */ + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; + ret = iomap_dio_complete(dio); + current->journal_info = NULL; + } + + /* No increment (+=) because iomap returns a cumulative value. */ + if (ret > 0) + written = ret; + + if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) { + const size_t left = iov_iter_count(from); + /* + * We have more data left to write. Try to fault in as many as + * possible of the remainder pages and retry. We do this without + * releasing and locking again the inode, to prevent races with + * truncate. 
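
One subtlety in the retry loop here: 'written' is passed back into btrfs_dio_write() as iomap's done_before argument, and iomap then reports progress cumulatively, which is why the loop assigns written = ret instead of accumulating. A hypothetical trace, with sizes assumed purely for illustration:

/*
 * Hypothetical trace of a 1 MiB write whose buffer faults once
 * (sizes assumed for illustration, not taken from this patch):
 *
 *   attempt 1: done_before = 0       -> 256 KiB completed, then a fault
 *   attempt 2: done_before = 262144  -> returns 1048576 (includes attempt 1)
 *
 * Hence "written = ret" rather than "written += ret".
 */
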
+ *
+ * Also, in case the iov refers to pages in the file range of the
+ * file we want to write to (due to a mmap), we could enter an
+ * infinite loop if we retry after faulting the pages in, since
+ * iomap will invalidate any pages in the range early on, before
+ * it tries to fault in the pages of the iov. So we keep track of
+ * how much was left of iov in the previous EFAULT and fall back
+ * to buffered IO in case we haven't made any progress.
+ */
+ if (left == prev_left) {
+ ret = -ENOTBLK;
+ } else {
+ fault_in_iov_iter_readable(from, left);
+ prev_left = left;
+ goto again;
+ }
+ }
+
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
+ /*
+ * If 'ret' is -ENOTBLK or we have not written all data, then it means
+ * we must fall back to buffered IO.
+ */
+ if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
+ goto out;
+
+buffered:
+ /*
+ * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+ * it must retry the operation in a context where blocking is acceptable,
+ * because even if we end up not blocking during the buffered IO attempt
+ * below, we will block when flushing and waiting for the IO.
+ */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ written_buffered = btrfs_buffered_write(iocb, from);
+ if (written_buffered < 0) {
+ ret = written_buffered;
+ goto out;
+ }
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
+ endbyte = pos + written_buffered - 1;
+ ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
+ if (ret)
+ goto out;
+ ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+ if (ret)
+ goto out;
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+out:
+ return ret < 0 ? ret : written;
+}
+
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ if (!iter_is_iovec(iter))
+ return 0;
+
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ const struct iovec *iov1 = iter_iov(iter) + seg;
+ const struct iovec *iov2 = iter_iov(iter) + i;
+
+ if (iov1->iov_base == iov2->iov_base)
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t prev_left = 0;
+ ssize_t read = 0;
+ ssize_t ret;
+
+ if (fsverity_active(inode))
+ return 0;
+
+ if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
+ return 0;
+
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+again:
+ /*
+ * This is similar to what we do for direct IO writes, see the comment
+ * at btrfs_direct_write(), but we also disable page faults in addition
+ * to disabling them only at the iov_iter level. This is because when
+ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+ * which can still trigger page fault ins despite having set ->nofault
+ * to true of our 'to' iov_iter.
+ *
+ * The difference to direct IO writes is that we deadlock when trying
+ * to lock the extent range in the inode's tree during the page reads
+ * triggered by the fault in (while for writes it is due to waiting for
+ * our own ordered extent). 
This is because for direct IO reads, + * btrfs_dio_iomap_begin() returns with the extent range locked, which + * is only unlocked in the endio callback (end_bio_extent_readpage()). + */ + pagefault_disable(); + to->nofault = true; + ret = btrfs_dio_read(iocb, to, read); + to->nofault = false; + pagefault_enable(); + + /* No increment (+=) because iomap returns a cumulative value. */ + if (ret > 0) + read = ret; + + if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { + const size_t left = iov_iter_count(to); + + if (left == prev_left) { + /* + * We didn't make any progress since the last attempt, + * fallback to a buffered read for the remainder of the + * range. This is just to avoid any possibility of looping + * for too long. + */ + ret = read; + } else { + /* + * We made some progress since the last retry or this is + * the first time we are retrying. Fault in as many pages + * as possible and retry. + */ + fault_in_iov_iter_writeable(to, left); + prev_left = left; + goto again; + } + } + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + return ret < 0 ? ret : read; +} + +int __init btrfs_init_dio(void) +{ + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_dio_private, bbio.bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + + return 0; +} + +void __cold btrfs_destroy_dio(void) +{ + bioset_exit(&btrfs_dio_bioset); +} diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h new file mode 100644 index 000000000000..3dc3ea926afe --- /dev/null +++ b/fs/btrfs/direct-io.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DIRECT_IO_H +#define BTRFS_DIRECT_IO_H + +#include <linux/types.h> + +int __init btrfs_init_dio(void); +void __cold btrfs_destroy_dio(void); + +ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from); +ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to); + +#endif /* BTRFS_DIRECT_IO_H */ diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 944a7340f6a4..de23c4b3515e 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { }; static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) + const struct btrfs_block_group *block_group) { return &discard_ctl->discard_list[block_group->discard_index]; } @@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, * * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. 
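
btrfs_init_dio() above uses a standard bio_set idiom: the front_pad argument is sized with offsetof() so that every bio allocated from the set sits at the tail of a private structure, which the completion path recovers with container_of(). A generic sketch of the pattern; the struct and function names here are illustrative, not btrfs code:

/* Generic front-pad sketch (needs <linux/bio.h>); names are illustrative. */
struct dio_priv {
        u64 file_offset;
        u32 bytes;
        struct bio bio;         /* must be last: offsetof() below is the pad */
};

static struct bio_set dio_bioset;

static int dio_bioset_setup(void)
{
        /* Every bio from this set carries a dio_priv allocated in front. */
        return bioset_init(&dio_bioset, BIO_POOL_SIZE,
                           offsetof(struct dio_priv, bio), BIOSET_NEED_BVECS);
}

static struct dio_priv *dio_priv_from_bio(struct bio *bio)
{
        return container_of(bio, struct dio_priv, bio);
}
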
*/ -static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl) { struct btrfs_fs_info *fs_info = container_of(discard_ctl, struct btrfs_fs_info, @@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { @@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, if (!btrfs_is_block_group_data_only(block_group)) return; + if (!btrfs_run_discard_work(discard_ctl)) + return; + spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); @@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. - */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -250,6 +245,18 @@ again: block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); + /* + * The block group must have been moved to other + * discard list even if discard was disabled in + * the meantime or a transaction abort happened, + * otherwise we can end up in an infinite loop, + * always jumping into the 'again' label and + * keep getting this block group over and over + * in case there are no other block groups in + * the discard lists. 
+ */ + ASSERT(block_group->discard_index != + BTRFS_DISCARD_INDEX_UNUSED); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); @@ -260,9 +267,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +501,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) + return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +566,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. - */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8ec411eb9c9b..e655fa3bfd9b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -17,7 +17,7 @@ #include <linux/error-injection.h> #include <linux/crc32c.h> #include <linux/sched/mm.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" @@ -29,8 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "check-integrity.h" -#include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" @@ -75,20 +73,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; - const int num_pages = num_extent_pages(buf); - const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); + int num_pages; + u32 first_page_part; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start); + + if (buf->addr) { + /* Pages are contiguous, handle them as a big one. 
*/ + kaddr = buf->addr; + first_page_part = fs_info->nodesize; + num_pages = 1; + } else { + kaddr = folio_address(buf->folios[0]); + first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); + num_pages = num_extent_pages(buf); + } + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); + /* + * Multiple single-page folios case would reach here. + * + * nodesize <= PAGE_SIZE and large folio all handled by above + * crypto_shash_update() already. + */ for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { - kaddr = page_address(buf->pages[i]); + kaddr = folio_address(buf->folios[i]); crypto_shash_update(shash, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); @@ -167,20 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages = num_extent_pages(eb); + int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - u64 start = max_t(u64, eb->start, page_offset(p)); - u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + u64 start = max_t(u64, eb->start, folio_pos(folio)); + u64 end = min_t(u64, eb->start + eb->len, + folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, p, offset_in_page(start), mirror_num); + start, folio, offset_in_folio(folio, start), + mirror_num); if (ret) break; } @@ -196,7 +213,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int failed = 0; @@ -245,6 +262,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start = btrfs_header_bytenr(eb); + u64 last_trans; u8 result[BTRFS_CSUM_SIZE]; int ret; @@ -254,15 +272,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) return BLK_STS_IOERR; - if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { - WARN_ON_ONCE(found_start != 0); + /* + * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't + * checksum it but zero-out its content. This is done to preserve + * ordering of I/O without unnecessarily writing out data. + */ + if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { + memzero_extent_buffer(eb, 0, eb->len); return BLK_STS_OK; } if (WARN_ON_ONCE(found_start != eb->start)) return BLK_STS_IOERR; - if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start, - eb->len))) + if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], + eb->start, eb->len))) return BLK_STS_IOERR; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, @@ -282,12 +305,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) * Also check the generation, the eb reached here must be newer than * last committed. Or something seriously wrong happened. 
*/ - if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + last_trans = btrfs_get_last_trans_committed(fs_info); + if (unlikely(btrfs_header_generation(eb) <= last_trans)) { ret = -EUCLEAN; btrfs_err(fs_info, "block=%llu bad generation, have %llu expect > %llu", - eb->start, btrfs_header_generation(eb), - fs_info->last_trans_committed); + eb->start, btrfs_header_generation(eb), last_trans); goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); @@ -318,9 +341,10 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) BTRFS_FSID_SIZE); /* - * alloc_fs_devices() copies the fsid into metadata_uuid if the - * metadata_uuid is unset in the superblock, including for a seed device. - * So, we can use fs_devices->metadata_uuid. + * alloc_fs_devices() copies the fsid into fs_devices::metadata_uuid. + * This is then overwritten by metadata_uuid if it is present in the + * device_list_add(). The same is true for a seed device as well. So use of + * fs_devices::metadata_uuid is appropriate here. */ if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) return false; @@ -334,7 +358,7 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) /* Do basic extent buffer checks at read time */ int btrfs_validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -343,6 +367,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, u8 result[BTRFS_CSUM_SIZE]; const u8 *header_csum; int ret = 0; + const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS); ASSERT(check); @@ -370,18 +395,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, } csum_tree_block(eb, result); - header_csum = page_address(eb->pages[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); + header_csum = folio_address(eb->folios[0]) + + get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum)); if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, -"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s", eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), - btrfs_header_level(eb)); - ret = -EUCLEAN; - goto out; + btrfs_header_level(eb), + ignore_csum ? 
", ignored" : ""); + if (!ignore_csum) { + ret = -EUCLEAN; + goto out; + } } if (found_level != check->level) { @@ -401,7 +429,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } if (check->has_first_key) { - struct btrfs_key *expect_key = &check->first_key; + const struct btrfs_key *expect_key = &check->first_key; struct btrfs_key found_key; if (found_level) @@ -473,15 +501,15 @@ static int btree_migrate_folio(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info; int ret; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_fs_info *fs_info; if (wbc->for_kupdate) return 0; - fs_info = BTRFS_I(mapping->host)->root->fs_info; + fs_info = inode_to_fs_info(mapping->host); /* this is a bit racy, but that's ok */ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, @@ -497,18 +525,19 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; - return try_release_extent_buffer(&folio->page); + return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(folio->mapping->host)->io_tree; + + tree = &folio_to_inode(folio)->io_tree; extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { - btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + btrfs_warn(folio_to_fs_info(folio), "folio private not zero on folio %llu", (unsigned long long)folio_pos(folio)); folio_detach_private(folio); @@ -519,7 +548,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; @@ -610,10 +639,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, check->owner_root)) { - free_extent_buffer_stale(buf); - return ERR_PTR(-EUCLEAN); - } return buf; } @@ -621,7 +646,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { - bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + bool dummy = btrfs_is_testing(fs_info); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -633,12 +658,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->state = 0; RB_CLEAR_NODE(&root->rb_node); - root->last_trans = 0; + btrfs_set_root_last_trans(root, 0); root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; - root->inode_tree = RB_ROOT; - INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + xa_init(&root->inodes); + xa_init(&root->delayed_nodes); btrfs_init_root_block_rsv(root); @@ -649,14 +674,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); 
INIT_LIST_HEAD(&root->reloc_dirty_list); - INIT_LIST_HEAD(&root->logged_list[0]); - INIT_LIST_HEAD(&root->logged_list[1]); - spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); - spin_lock_init(&root->log_extents_lock[0]); - spin_lock_init(&root->log_extents_lock[1]); spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); @@ -675,9 +695,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); - root->log_transid = 0; + btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - root->last_log_commit = 0; + btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, @@ -754,7 +774,7 @@ int btrfs_global_root_insert(struct btrfs_root *root) if (tmp) { ret = -EEXIST; btrfs_warn(fs_info, "global root %llu %llu already exists", - root->root_key.objectid, root->root_key.offset); + btrfs_root_id(root), root->root_key.offset); } return ret; } @@ -826,13 +846,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } -struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -859,7 +872,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.offset = 0; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, - BTRFS_NESTING_NORMAL); + 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -936,7 +949,7 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); + NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) return PTR_ERR(leaf); @@ -989,8 +1002,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return ret; } - log_root->last_trans = trans->transid; - log_root->root_key.offset = root->root_key.objectid; + btrfs_set_root_last_trans(log_root, trans->transid); + log_root->root_key.offset = btrfs_root_id(root); inode_item = &log_root->root_item.inode; btrfs_set_stack_inode_generation(inode_item, 1); @@ -1004,15 +1017,15 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; - root->log_transid = 0; + btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - root->last_log_commit = 0; + btrfs_set_root_last_log_commit(root, 0); return 0; } static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_tree_parent_check check = { 0 }; @@ -1054,15 +1067,15 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - 
root->root_key.objectid != btrfs_header_owner(root->node)) { + if (!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node)) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", - root->root_key.objectid, root->node->start, + btrfs_root_id(root), root->node->start, btrfs_header_owner(root->node), - root->root_key.objectid); + btrfs_root_id(root)); ret = -EUCLEAN; goto fail; } @@ -1074,7 +1087,7 @@ fail: } struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_path *path; @@ -1099,9 +1112,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_drew_lock_init(&root->snapshot_lock); - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && - is_fstree(root->root_key.objectid)) { + is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1110,7 +1123,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ - if (is_fstree(root->root_key.objectid) && + if (is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); @@ -1179,6 +1192,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, return btrfs_grab_root(fs_info->block_group_root); case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_RAID_STRIPE_TREE_OBJECTID: + return btrfs_grab_root(fs_info->stripe_root); default: return NULL; } @@ -1195,7 +1210,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), root); if (ret == 0) { btrfs_grab_root(root); @@ -1207,7 +1222,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } -void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_DEBUG struct btrfs_root *root; @@ -1220,6 +1235,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); + WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); @@ -1241,9 +1257,14 @@ static void free_global_roots(struct btrfs_fs_info *fs_info) void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { + struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); + if (percpu_counter_initialized(em_counter)) + ASSERT(percpu_counter_sum_positive(em_counter) == 0); + percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -1259,11 +1280,11 @@ void btrfs_free_fs_info(struct btrfs_fs_info 
*fs_info) btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); + btrfs_put_root(fs_info->stripe_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -1411,7 +1432,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, } /* - * btrfs_get_fs_root_commit_root - return a root for the given objectid + * Return a root for the given objectid. + * * @fs_info: the fs_info * @objectid: the objectid we need to lookup * @@ -1708,11 +1730,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) } /* - * read_backup_root - Reads a backup root based on the passed priority. Prio 0 - * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots + * Reads a backup root based on the passed priority. Prio 0 is the newest, prio + * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots * - * fs_info - filesystem whose backup roots need to be read - * priority - priority of backup root required + * @fs_info: filesystem whose backup roots need to be read + * @priority: priority of backup root required * * Returns backup root index on success and -EINVAL otherwise. */ @@ -1812,6 +1834,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); + free_root_extent_buffers(info->stripe_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -1822,7 +1845,8 @@ void btrfs_put_root(struct btrfs_root *root) return; if (refcount_dec_and_test(&root->refs)) { - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (WARN_ON(!xa_empty(&root->inodes))) + xa_destroy(&root->inodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); @@ -1896,7 +1920,7 @@ static int btrfs_init_btree_inode(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID); set_nlink(inode, 1); /* * we set the i_size on the btree inode to the max possible int. @@ -1907,15 +1931,11 @@ static int btrfs_init_btree_inode(struct super_block *sb) inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; - BTRFS_I(inode)->location.type = 0; - BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); fs_info->btree_inode = inode; @@ -1939,7 +1959,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2114,7 +2134,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, /* If we have IGNOREDATACSUMS skip loading these roots. 
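
For the read_backup_root() documentation a few hunks above: priorities 0..3 index a small ring of backup root slots. A self-contained sketch of that ring arithmetic, under the assumption that `next` names the slot the next backup would be written to (illustrative, not the kernel function):

    #include <assert.h>

    #define NUM_BACKUP_ROOTS 4      /* BTRFS_NUM_BACKUP_ROOTS */

    /* Priority 0 is the newest backup, 3 the oldest. */
    static int backup_slot(int next, int priority)
    {
            return (next + NUM_BACKUP_ROOTS - 1 - priority) % NUM_BACKUP_ROOTS;
    }

    int main(void)
    {
            /* If slot 2 is written next, slot 1 holds the newest backup... */
            assert(backup_slot(2, 0) == 1);
            /* ...and the oldest is the slot about to be reused. */
            assert(backup_slot(2, 3) == 2);
            return 0;
    }
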
*/ if (objectid == BTRFS_CSUM_TREE_OBJECTID && btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { - set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); return 0; } @@ -2148,8 +2168,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) - ret = PTR_ERR(root); + ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -2167,7 +2186,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) - set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) ret = ret ? ret : -ENOENT; @@ -2212,7 +2231,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) struct btrfs_key location; int ret; - BUG_ON(!fs_info->tree_root); + ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret) @@ -2271,7 +2290,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); fs_info->quota_root = root; } @@ -2288,6 +2306,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->uuid_root = root; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->stripe_root = root; + } + } + return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", @@ -2305,21 +2337,29 @@ out: * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ -int btrfs_validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num) +int btrfs_validate_super(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS); if (btrfs_super_magic(sb) != BTRFS_MAGIC) { btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { - btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", - btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); - ret = -EINVAL; + if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { + if (!ignore_flags) { + btrfs_err(fs_info, + "unrecognized or unsupported super flag 0x%llx", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } else { + btrfs_info(fs_info, + "unrecognized or unsupported super flags: 0x%llx, ignored", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + } } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", @@ -2390,7 +2430,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { + if (!fs_info->fs_devices->temp_fsid && + memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of 
fs_devices: %pU != %pU", sb->fsid, fs_info->fs_devices->fsid); @@ -2421,7 +2462,7 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || !btrfs_fs_incompat(fs_info, NO_HOLES))) { btrfs_err(fs_info, - "block-group-tree feature requires fres-space-tree and no-holes"); + "block-group-tree feature requires free-space-tree and no-holes"); ret = -EINVAL; } @@ -2542,7 +2583,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev struct btrfs_tree_parent_check check = { .level = level, .transid = gen, - .owner_root = root->root_key.objectid + .owner_root = btrfs_root_id(root) }; int ret = 0; @@ -2607,9 +2648,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) */ btrfs_set_super_log_root(sb, 0); - /* We can't trust the free space cache either */ - btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); - btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; @@ -2643,7 +2681,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = btrfs_header_generation(tree_root->node); - fs_info->last_trans_committed = fs_info->generation; + btrfs_set_last_trans_committed(fs_info, fs_info->generation); fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -2713,7 +2751,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->allocated_ebs); spin_lock_init(&fs_info->eb_leak_lock); #endif - extent_map_tree_init(&fs_info->mapping_tree); + fs_info->mapping_tree = RB_ROOT_CACHED; + rwlock_init(&fs_info->mapping_tree_lock); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); @@ -2744,11 +2783,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->ordered_root_lock); btrfs_init_scrub(fs_info); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - fs_info->check_integrity_print_mask = 0; -#endif btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -2786,6 +2823,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + /* Default compress algorithm when user does -o compress */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; spin_lock_init(&fs_info->swapfile_pins_lock); @@ -2808,6 +2848,12 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL); + if (ret) + return ret; + + spin_lock_init(&fs_info->extent_map_shrinker_lock); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -2832,6 +2878,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (sb_rdonly(sb)) set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); + if (btrfs_test_opt(fs_info, IGNOREMETACSUMS)) + set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state); return btrfs_alloc_stripe_hash_table(fs_info); } @@ -2877,22 +2925,22 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i = 0; - int err = 0; - unsigned 
int ret = 0; + int ret = 0; while (1) { + unsigned int found; + spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + found = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) { + if (!found) { spin_unlock(&fs_info->fs_roots_radix_lock); break; } - root_objectid = gang[ret - 1]->root_key.objectid + 1; + root_objectid = btrfs_root_id(gang[found - 1]) + 1; - for (i = 0; i < ret; i++) { + for (int i = 0; i < found; i++) { /* Avoid to grab roots in dead_roots. */ if (btrfs_root_refs(&gang[i]->root_item) == 0) { gang[i] = NULL; @@ -2903,35 +2951,25 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->fs_roots_radix_lock); - for (i = 0; i < ret; i++) { + for (int i = 0; i < found; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - goto out; + root_objectid = btrfs_root_id(gang[i]); + /* + * Continue to release the remaining roots after the first + * error without cleanup and preserve the first error + * for the return. + */ + if (!ret) + ret = btrfs_orphan_cleanup(gang[i]); btrfs_put_root(gang[i]); } + if (ret) + break; + root_objectid++; } -out: - /* Release the uncleaned roots due to error. */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); - } - return err; -} - -/* - * Some options only have meaning at mount time and shouldn't persist across - * remounts, or be displayed. Clear these at the end of mount and remount - * code paths. - */ -void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) -{ - btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); - btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); + return ret; } /* @@ -2946,7 +2984,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - rebuild_free_space_tree = true; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + btrfs_warn(fs_info, + "'clear_cache' option is ignored with extent tree v2"); + else + rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { btrfs_warn(fs_info, "free space tree is invalid"); @@ -3160,14 +3202,12 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; u32 stripesize; u64 generation; - u64 features; u16 csum_type; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -3250,15 +3290,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device disk_super = fs_info->super_copy; - - features = btrfs_super_flags(disk_super); - if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { - features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; - btrfs_set_super_flags(disk_super, features); - btrfs_info(fs_info, - "found metadata UUID change in progress flag, clearing"); - } - memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_for_commit)); @@ -3279,13 +3310,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) WRITE_ONCE(fs_info->fs_error, -EUCLEAN); - /* - * In the long 
term, we'll store the compression type in the super - * block, and it'll be used for per file compression control. - */ - fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - - /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); @@ -3296,42 +3320,35 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; - ret = btrfs_parse_options(fs_info, options, sb->s_flags); - if (ret) + /* + * Handle the space caching options appropriately now that we have the + * super block loaded and validated. + */ + btrfs_set_free_space_cache_settings(fs_info); + + if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) { + ret = -EINVAL; goto fail_alloc; + } ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); if (ret < 0) goto fail_alloc; - if (sectorsize < PAGE_SIZE) { - struct btrfs_subpage_info *subpage_info; - - /* - * V1 space cache has some hardcoded PAGE_SIZE usage, and is - * going to be deprecated. - * - * Force to use v2 cache for subpage case. - */ - btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(fs_info, FREE_SPACE_TREE, - "forcing free space tree for sector size %u with page size %lu", - sectorsize, PAGE_SIZE); + /* + * At this point our mount options are validated, if we set ->max_inline + * to something non-standard make sure we truncate it to sectorsize. + */ + fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); + if (sectorsize < PAGE_SIZE) btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) { - ret = -ENOMEM; - goto fail_alloc; - } - btrfs_init_subpage_info(subpage_info, sectorsize); - fs_info->subpage_info = subpage_info; - } ret = btrfs_init_workqueues(fs_info); if (ret) @@ -3498,41 +3515,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_cleaner; } - if (!btrfs_test_opt(fs_info, NOSSD) && - !fs_info->fs_devices->rotating) { - btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); - } - - /* - * For devices supporting discard turn on discard=async automatically, - * unless it's already set or disabled. This could be turned off by - * nodiscard for the same mount. - * - * The zoned mode piggy backs on the discard functionality for - * resetting a zone. There is no reason to delay the zone reset as it is - * fast enough. So, do not enable async discard for zoned mode. - */ - if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || - btrfs_test_opt(fs_info, DISCARD_ASYNC) || - btrfs_test_opt(fs_info, NODISCARD)) && - fs_info->fs_devices->discardable && - !btrfs_is_zoned(fs_info)) { - btrfs_set_and_info(fs_info, DISCARD_ASYNC, - "auto enabling async discard"); - } - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { - ret = btrfsic_mount(fs_info, fs_devices, - btrfs_test_opt(fs_info, - CHECK_INTEGRITY_DATA) ? 
1 : 0, - fs_info->check_integrity_print_mask); - if (ret) - btrfs_warn(fs_info, - "failed to initialize integrity check module: %d", - ret); - } -#endif ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; @@ -3558,7 +3540,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } if (sb_rdonly(sb)) - goto clear_oneshot; + return 0; ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { @@ -3586,8 +3568,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); -clear_oneshot: - btrfs_clear_oneshot_options(fs_info); return 0; fail_qgroup: @@ -3624,7 +3604,7 @@ fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); fail_alloc: - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); iput(fs_info->btree_inode); fail: @@ -3637,28 +3617,25 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_super_write(struct bio *bio) { struct btrfs_device *device = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - struct page *page; - - bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; + struct folio_iter fi; + bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s (%d)", + "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); - ClearPageUptodate(page); - SetPageError(page); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); - } else { - SetPageUptodate(page); + /* Ensure failure if the primary sb fails. */ + if (bio->bi_opf & REQ_FUA) + atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR, + &device->sb_write_errors); + else + atomic_inc(&device->sb_write_errors); } - - put_page(page); - unlock_page(page); + folio_unlock(fi.folio); + folio_put(fi.folio); } bio_put(bio); @@ -3670,7 +3647,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, struct btrfs_super_block *super; struct page *page; u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; int ret; bytenr_orig = btrfs_sb_offset(copy_num); @@ -3745,34 +3722,36 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) /* * Write superblock @sb to the @device. Do not wait for completion, all the - * pages we use for writing are locked. + * folios we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when page is not found or submission fails. + * Return number of errors when folio is not found or submission fails. 
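
The btrfs_end_super_write() rewrite above counts failures in device->sb_write_errors and weights a primary (REQ_FUA) failure with the large constant BTRFS_SUPER_PRIMARY_WRITE_ERROR, so the wait phase can tell "primary failed" from "some copies failed" with a single comparison. A self-contained sketch of the trick, with an illustrative weight (the kernel uses an atomic_t and its own constant):

    #include <assert.h>

    /* Illustrative: only needs to exceed any possible count of copy failures. */
    #define PRIMARY_WRITE_ERROR 16

    static int sb_write_errors;

    static void record_error(int is_primary)
    {
            sb_write_errors += is_primary ? PRIMARY_WRITE_ERROR : 1;
    }

    int main(void)
    {
            record_error(0);
            record_error(0);
            assert(sb_write_errors < PRIMARY_WRITE_ERROR);  /* copies only */
            record_error(1);
            assert(sb_write_errors >= PRIMARY_WRITE_ERROR); /* primary failed */
            return 0;
    }
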
*/ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; - struct address_space *mapping = device->bdev->bd_inode->i_mapping; + struct address_space *mapping = device->bdev->bd_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; - int errors = 0; int ret; u64 bytenr, bytenr_orig; + atomic_set(&device->sb_write_errors, 0); + if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; struct bio *bio; struct btrfs_super_block *disk_super; + size_t offset; bytenr_orig = btrfs_sb_offset(i); ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); @@ -3782,7 +3761,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_err(device->fs_info, "couldn't get super block location for mirror %d", i); - errors++; + atomic_inc(&device->sb_write_errors); continue; } if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3795,20 +3774,20 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); - page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, - GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(folio)) { btrfs_err(device->fs_info, "couldn't get super block page for bytenr %llu", bytenr); - errors++; + atomic_inc(&device->sb_write_errors); continue; } + ASSERT(folio_order(folio) == 0); - /* Bump the refcount for wait_dev_supers() */ - get_page(page); - - disk_super = page_address(page); + offset = offset_in_folio(folio, bytenr); + disk_super = folio_address(folio) + offset; memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); /* @@ -3822,8 +3801,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; - __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, - offset_in_page(bytenr)); + bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset); /* * We FUA only the first super block. The others we allow to @@ -3832,22 +3810,20 @@ static int write_dev_supers(struct btrfs_device *device, */ if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - - btrfsic_check_bio(bio); submit_bio(bio); if (btrfs_advance_sb_log(device, i)) - errors++; + atomic_inc(&device->sb_write_errors); } - return errors < i ? 0 : -1; + return atomic_read(&device->sb_write_errors) < i ? 0 : -1; } /* * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when page is not found or not marked up to - * date. + * Return -1 if primary super block write failed or when there were no super block + * copies written. Otherwise 0. 
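
A kernel-context sketch of the completion protocol the wait phase below depends on: super block folios are submitted locked, and btrfs_end_super_write() unlocks them from the bio end_io handler, so waiting amounts to a page-cache lookup plus folio_wait_locked(). The helper name and parameters are illustrative:

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/types.h>

    static void wait_one_super(struct address_space *mapping, u64 bytenr)
    {
            struct folio *folio;

            folio = filemap_get_folio(mapping, bytenr >> PAGE_SHIFT);
            /* Already evicted: the write finished and the folio was reclaimed. */
            if (IS_ERR(folio))
                    return;
            /* Submitted locked; the end_io handler unlocks on completion. */
            folio_wait_locked(folio);
            folio_put(folio);       /* drop the lookup reference */
    }
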
*/ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { @@ -3861,7 +3837,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { @@ -3876,30 +3852,21 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - page = find_get_page(device->bdev->bd_inode->i_mapping, - bytenr >> PAGE_SHIFT); - if (!page) { - errors++; - if (i == 0) - primary_failed = true; + folio = filemap_get_folio(device->bdev->bd_mapping, + bytenr >> PAGE_SHIFT); + /* If the folio has been removed, then we know it completed. */ + if (IS_ERR(folio)) continue; - } - /* Page is submitted locked and unlocked once the IO completes */ - wait_on_page_locked(page); - if (PageError(page)) { - errors++; - if (i == 0) - primary_failed = true; - } + ASSERT(folio_order(folio) == 0); - /* Drop our reference */ - put_page(page); - - /* Drop the reference from the writing run */ - put_page(page); + /* Folio will be unlocked once the write completes. */ + folio_wait_locked(folio); + folio_put(folio); } - /* log error, force error return */ + errors += atomic_read(&device->sb_write_errors); + if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) + primary_failed = true; if (primary_failed) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", device->devid); @@ -3929,28 +3896,11 @@ static void write_dev_flush(struct btrfs_device *device) device->last_flush_error = BLK_STS_OK; -#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY - /* - * When a disk has write caching disabled, we skip submission of a bio - * with flush and sync requests before writing the superblock, since - * it's not needed. However when the integrity checker is enabled, this - * results in reports that there are metadata blocks referred by a - * superblock that were not properly flushed. So don't skip the bio - * submission only when the integrity checker is enabled for the sake - * of simplicity, since this is a debug tool and not meant for use in - * non-debug builds. 
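
Alongside the write_dev_flush() hunk below: with the integrity checker gone, the function unconditionally issues an "empty barrier", a data-less bio whose only effect is the REQ_PREFLUSH cache flush, with completion signalled from the end_io handler. A kernel-context sketch of that pattern; names are illustrative, not btrfs's:

    #include <linux/bio.h>
    #include <linux/completion.h>

    static void flush_done(struct bio *bio)
    {
            complete(bio->bi_private);
    }

    static void issue_empty_flush(struct block_device *bdev, struct bio *bio,
                                  struct completion *done)
    {
            /* No data pages attached: max_vecs is 0. */
            bio_init(bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
            init_completion(done);
            bio->bi_end_io = flush_done;
            bio->bi_private = done;
            submit_bio(bio);
            /* The caller later does wait_for_completion_io(done) and then
             * inspects bio->bi_status, as wait_dev_flush() does. */
    }
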
- */ - if (!bdev_write_cache(device->bdev)) - return; -#endif - bio_init(bio, device->bdev, NULL, 0, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - - btrfsic_check_bio(bio); submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4177,7 +4127,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); + (unsigned long)btrfs_root_id(root)); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); @@ -4196,9 +4146,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_commit_super(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_trans_handle *trans; - mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); @@ -4208,10 +4155,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - return btrfs_commit_transaction(trans); + return btrfs_commit_current_transaction(fs_info->tree_root); } static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) @@ -4220,9 +4164,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) struct btrfs_transaction *tmp; bool found = false; - if (list_empty(&fs_info->trans_list)) - return; - /* * This function is only called at the very end of close_ctree(), * thus no other running transaction, no need to take trans_lock. @@ -4314,6 +4255,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we * already the cleaner, but below we run all pending delayed iputs. */ btrfs_flush_workqueue(fs_info->fixup_workers); + /* + * Similar case here, we have to wait for delalloc workers before we + * proceed below and stop the cleaner kthread, otherwise we trigger a + * use-after-free on the cleaner kthread task_struct when a delalloc + * worker running submit_compressed_extents() adds a delayed iput, which + * does a wake up on the cleaner kthread, which was already freed below + * when we call kthread_stop(). + */ + btrfs_flush_workqueue(fs_info->delalloc_workers); + + /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. 
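
Condensing the rule behind this run of close_ctree() comments (continued just below): every workqueue that can still schedule a delayed iput, and with it a wakeup of the cleaner, must be drained before kthread_stop() frees the cleaner's task_struct. Schematically, with the calls taken from these hunks:

    /* 1. Flush everything that may still queue delayed iputs. */
    btrfs_flush_workqueue(fs_info->fixup_workers);
    btrfs_flush_workqueue(fs_info->delalloc_workers);
    btrfs_flush_workqueue(fs_info->workers);
    flush_workqueue(fs_info->compressed_write_workers);
    /* 2. Only after the remaining iputs have run is this safe: */
    kthread_stop(fs_info->cleaner_kthread);
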
So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turns does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes. + */ + flush_workqueue(fs_info->compressed_write_workers); /* * After we parked the cleaner kthread, ordered extents may have @@ -4352,6 +4335,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->extent_map_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4381,9 +4365,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4437,12 +4418,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) - btrfsic_unmount(fs_info->fs_devices); -#endif - - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); btrfs_close_devices(fs_info->fs_devices); } @@ -4464,22 +4440,13 @@ void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */ ASSERT(trans->transid == fs_info->generation); btrfs_assert_tree_write_locked(buf); - if (transid != fs_info->generation) { - WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", - buf->start, transid, fs_info->generation); + if (unlikely(transid != fs_info->generation)) { btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu", + buf->start, transid, fs_info->generation); } set_extent_buffer_dirty(buf); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - /* - * btrfs_check_leaf() won't check item data if we don't have WRITTEN - * set, so this will only validate the basic structure of the items. 
- */ - if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { - btrfs_print_leaf(buf); - ASSERT(0); - } -#endif } static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, @@ -4520,10 +4487,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } @@ -4547,7 +4510,7 @@ static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; + root_objectid = btrfs_root_id(gang[i]); btrfs_free_log(NULL, gang[i]); btrfs_put_root(gang[i]); } @@ -4600,7 +4563,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) * extents that haven't had their dirty pages IO start writeout yet * actually get run and error out properly. */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -4631,6 +4594,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } if (head->must_insert_reserved) pin_bytes = true; @@ -4683,7 +4647,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - __btrfs_del_delalloc_inode(root, btrfs_inode); + btrfs_del_delalloc_inode(btrfs_inode); spin_unlock(&root->delalloc_lock); /* @@ -4828,7 +4792,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4869,7 +4833,7 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) btrfs_qgroup_free_meta_all_pertrans(root); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); } } @@ -4898,14 +4862,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&fs_info->transaction_wait); - btrfs_destroy_delayed_inodes(fs_info); - btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_all_qgroup_pertrans(fs_info); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4958,6 +4918,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_all_delalloc_inodes(fs_info); btrfs_drop_all_logs(fs_info); + btrfs_free_all_qgroup_pertrans(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; @@ -4982,7 +4943,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a 
root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto error; + } if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; @@ -5006,7 +4974,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", - root->root_key.objectid); + btrfs_root_id(root)); ret = -ENOSPC; goto out; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index fca52385830c..127e31e08347 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,6 +6,22 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H +#include <linux/sizes.h> +#include <linux/compiler_types.h> +#include "ctree.h" +#include "fs.h" + +struct block_device; +struct super_block; +struct extent_buffer; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info; +struct btrfs_super_block; +struct btrfs_trans_handle; +struct btrfs_tree_parent_check; +struct btrfs_transaction; + #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 @@ -25,11 +41,7 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_tree_parent_check; - -void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); +void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check); @@ -37,18 +49,13 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); -void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, - struct extent_buffer *buf); -void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); -int __cold open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); void __cold close_ctree(struct btrfs_fs_info *fs_info); -int btrfs_validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num); +int btrfs_validate_super(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb, int mirror_num); int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); @@ -56,7 +63,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key); + const struct btrfs_key *key); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); @@ -74,7 +81,6 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); -struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); 
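
On the disk-io.h hunk above: a forward declaration is enough for any type the header only ever names behind a pointer, which is why the block of explicit struct declarations can replace most includes. A generic illustration, not tied to the real headers:

    struct btrfs_fs_info;   /* incomplete type: layout unknown, and not needed */

    /* Pointer parameters type-check fine against an incomplete type. */
    void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);

    /* Only code needing the size or the members, e.g.
     * sizeof(struct btrfs_fs_info) or fs_info->generation, forces the
     * full definition and therefore the heavier #include. */
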
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); @@ -82,7 +88,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); int btrfs_validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check); + const struct btrfs_tree_parent_check *check); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -109,7 +115,7 @@ void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, - struct btrfs_tree_parent_check *check); + const struct btrfs_tree_parent_check *check); blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 203e5964c9b0..e2b22bea348a 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "ctree.h" #include "disk-io.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "export.h" #include "accessors.h" #include "super.h" @@ -35,15 +34,15 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, type = FILEID_BTRFS_WITHOUT_PARENT; fid->objectid = btrfs_ino(BTRFS_I(inode)); - fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid; + fid->root_objectid = btrfs_root_id(BTRFS_I(inode)->root); fid->gen = inode->i_generation; if (parent) { u64 parent_root_id; - fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_objectid = btrfs_ino(BTRFS_I(parent)); fid->parent_gen = parent->i_generation; - parent_root_id = BTRFS_I(parent)->root->root_key.objectid; + parent_root_id = btrfs_root_id(BTRFS_I(parent)->root); if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; @@ -85,7 +84,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(root)) return ERR_CAST(root); - inode = btrfs_iget(sb, objectid, root); + inode = btrfs_iget(objectid, root); btrfs_put_root(root); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -161,7 +160,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) return ERR_PTR(-ENOMEM); if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = root->root_key.objectid; + key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; @@ -211,7 +210,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0); } - return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); + return d_obtain_alias(btrfs_iget(key.objectid, root)); fail: btrfs_free_path(path); return ERR_PTR(ret); @@ -222,7 +221,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, { struct inode *inode = d_inode(child); struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; @@ -244,7 +243,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.objectid = btrfs_root_id(BTRFS_I(inode)->root); key.type = 
BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index eba6bc4f5a61..464582273af9 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -4,6 +4,10 @@ #define BTRFS_EXPORT_H #include <linux/exportfs.h> +#include <linux/types.h> + +struct dentry; +struct super_block; extern const struct export_operations btrfs_export_ops; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index ff8e117a1ace..5f9a43734812 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -4,9 +4,9 @@ #include <trace/events/btrfs.h> #include "messages.h" #include "ctree.h" +#include "extent_io.h" #include "extent-io-tree.h" #include "btrfs_inode.h" -#include "misc.h" static struct kmem_cache *extent_state_cache; @@ -48,6 +48,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) extent_state_in_tree(state), refcount_read(&state->refs)); list_del(&state->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_state_cache, state); } } @@ -58,12 +59,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct btrfs_inode *inode = tree->inode; + const struct btrfs_inode *inode; u64 isize; - if (!inode) + if (tree->owner != IO_TREE_INODE_IO) return; + inode = extent_io_tree_to_inode_const(tree); isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { btrfs_debug_rl(inode->root->fs_info, @@ -78,59 +80,82 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif + /* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. + * The only tree allowed to set the inode is IO_TREE_INODE_IO. */ -static struct lock_class_key file_extent_tree_class; +static bool is_inode_io_tree(const struct extent_io_tree *tree) +{ + return tree->owner == IO_TREE_INODE_IO; +} -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; -}; +/* Return the inode if it's valid for the given tree, otherwise NULL. */ +struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode; + return NULL; +} + +/* Read-only access to the inode. */ +const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode; + return NULL; +} + +/* For read-only access to fs_info. */ +const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode->root->fs_info; + return tree->fs_info; +} void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner) { - tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->inode = NULL; + tree->fs_info = fs_info; tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); } +/* + * Empty an io tree, removing and freeing every extent state record from the + * tree. 
This should be called once we are sure no other task can access the + * tree anymore, so no tree updates happen after we empty the tree and there + * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never + * set on any extent state when calling this function). + */ void extent_io_tree_release(struct extent_io_tree *tree) { + struct rb_root root; + struct extent_state *state; + struct extent_state *tmp; + spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); + root = tree->state; + tree->state = RB_ROOT; + rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { + /* Clear node to keep free_extent_state() happy. */ RB_CLEAR_NODE(&state->rb_node); + ASSERT(!(state->state & EXTENT_LOCK_BITS)); /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. + * No need for a memory barrier here, as we are holding the tree + * lock and we only change the waitqueue while holding that lock + * (see wait_extent_bit()). */ ASSERT(!waitqueue_active(&state->wq)); free_extent_state(state); - cond_resched_lock(&tree->lock); } + /* + * Should still be empty even after a reschedule, no other task should + * be accessing the tree anymore. + */ + ASSERT(RB_EMPTY_ROOT(&tree->state)); spin_unlock(&tree->lock); } @@ -321,10 +346,46 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +static void extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { - btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); + btrfs_panic(extent_io_tree_to_fs_info(tree), err, + "extent io tree error on %s state start %llu end %llu", + opname, state->start, state->end); +} + +static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *prev; + + prev = prev_state(state); + if (prev && prev->end == state->start - 1 && prev->state == state->state) { + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), + state, prev); + state->start = prev->start; + rb_erase(&prev->rb_node, &tree->state); + RB_CLEAR_NODE(&prev->rb_node); + free_extent_state(prev); + } +} + +static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *next; + + next = next_state(state); + if (next && next->start == state->end + 1 && next->state == state->state) { + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), + state, next); + state->end = next->end; + rb_erase(&next->rb_node, &tree->state); + RB_CLEAR_NODE(&next->rb_node); + free_extent_state(next); + } } /* @@ -338,31 +399,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - struct extent_state *other; - - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCK_BITS | 
EXTENT_BOUNDARY)) return; - other = prev_state(state); - if (other && other->end == state->start - 1 && - other->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - other = next_state(state); - if (other && other->start == state->end + 1 && - other->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } + merge_prev_state(tree, state); + merge_next_state(tree, state); } static void set_state_bits(struct extent_io_tree *tree, @@ -372,8 +413,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (tree->inode) - btrfs_set_delalloc_extent(tree->inode, state, bits); + if (is_inode_io_tree(tree)) + btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -384,19 +425,27 @@ static void set_state_bits(struct extent_io_tree *tree, * Insert an extent_state struct into the tree. 'bits' are set on the * struct before it is inserted. * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. + * Returns a pointer to the struct extent_state record containing the range + * requested for insertion, which may be the same as the given struct or it + * may be an existing record in the tree that was expanded to accommodate the + * requested range. In case of an extent_state different from the one that was + * given, the latter can be freed or reused by the caller. + * + * On error it returns an error pointer. * * The tree lock is not taken internally. This is a utility function and * probably isn't what you want to call (see set/clear_extent_bit). 
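The open-coded neighbor merging above is now factored into merge_prev_state() and merge_next_state() so that insert_state() can reuse them when an incoming range merely extends an existing record. A minimal userspace sketch of the coalescing rule (adjacent ranges with identical state bits become one record); this uses a plain sorted array for brevity, whereas the kernel keeps the states in an rbtree:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; uint32_t state; };

/* Merge r[i] with its predecessor when they touch and carry equal state,
 * mirroring merge_prev_state(); returns the index of the surviving record. */
static int merge_prev(struct range *r, int *n, int i)
{
    if (i > 0 && r[i - 1].end == r[i].start - 1 && r[i - 1].state == r[i].state) {
        r[i - 1].end = r[i].end;
        for (int j = i; j < *n - 1; j++)   /* drop the absorbed record */
            r[j] = r[j + 1];
        (*n)--;
        return i - 1;
    }
    return i;
}

int main(void)
{
    struct range r[] = { { 0, 9, 0x2 }, { 10, 19, 0x2 }, { 30, 39, 0x2 } };
    int n = 3;

    /* [0,9] and [10,19] touch with equal state: they coalesce. */
    merge_prev(r, &n, 1);
    /* [30,39] does not touch [0,19], so it stays separate. */
    merge_prev(r, &n, 1);

    for (int i = 0; i < n; i++)
        printf("[%llu, %llu]\n", (unsigned long long)r[i].start,
               (unsigned long long)r[i].end);
    return 0;
}

Running it prints [0, 19] and [30, 39]: the touching ranges coalesced, the disjoint one did not.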
*/ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) +static struct extent_state *insert_state(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, + struct extent_changeset *changeset) { struct rb_node **node; struct rb_node *parent = NULL; - const u64 end = state->end; + const u64 start = state->start - 1; + const u64 end = state->end + 1; + const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -407,23 +456,41 @@ static int insert_state(struct extent_io_tree *tree, parent = *node; entry = rb_entry(parent, struct extent_state, rb_node); - if (end < entry->start) { + if (state->end < entry->start) { + if (try_merge && end == entry->start && + state->state == entry->state) { + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent( + extent_io_tree_to_inode(tree), + state, entry); + entry->start = state->start; + merge_prev_state(tree, entry); + state->state = 0; + return entry; + } node = &(*node)->rb_left; - } else if (end > entry->end) { + } else if (state->end > entry->end) { + if (try_merge && entry->end == start && + state->state == entry->state) { + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent( + extent_io_tree_to_inode(tree), + state, entry); + entry->end = state->end; + merge_next_state(tree, entry); + state->state = 0; + return entry; + } node = &(*node)->rb_right; } else { - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, end); - return -EEXIST; + return ERR_PTR(-EEXIST); } } rb_link_node(&state->rb_node, parent, node); rb_insert_color(&state->rb_node, &tree->state); - merge_state(tree, state); - return 0; + return state; } /* @@ -460,8 +527,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (tree->inode) - btrfs_split_delalloc_extent(tree->inode, orig, split); + if (is_inode_io_tree(tree)) + btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig, + split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -508,8 +576,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (tree->inode) - btrfs_clear_delalloc_extent(tree->inode, state, bits); + if (is_inode_io_tree(tree)) + btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state, + bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); @@ -547,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * inserting elements in the tree, so the gfp mask is used to indicate which * allocations or sleeping are allowed. * - * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given - * range from the tree regardless of state (ie for truncate). - * * The range [start, end] is inclusive. * * This takes the tree lock, and returns 0 on success and < 0 on error. @@ -578,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = (bits & EXTENT_LOCKED) ? 1 : 0; - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + wake = ((bits & EXTENT_LOCK_BITS) ? 
1 : 0); + if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc) { @@ -650,7 +716,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) @@ -672,7 +738,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); if (wake) wake_up(&state->wq); @@ -708,26 +774,13 @@ out: } -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - /* * Wait for one or more bits to clear on a range in the state tree. * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state) +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) { struct extent_state *state; @@ -758,9 +811,15 @@ process_node: goto out; if (state->state & bits) { + DEFINE_WAIT(wait); + start = state->start; refcount_inc(&state->refs); - wait_on_state(tree, state); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); free_extent_state(state); goto again; } @@ -799,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state, static void cache_state(struct extent_state *state, struct extent_state **cached_ptr) { - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); + return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY); } /* @@ -847,10 +905,19 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, if (state->end == start - 1 && extent_state_in_tree(state)) { while ((state = next_state(state)) != NULL) { if (state->state & bits) - goto got_it; + break; } + /* + * If we found the next extent state, clear cached_state + * so that we can cache the next extent state below and + * avoid future calls going over the same extent state + * again. If we haven't found any, clear as well since + * it's now useless. + */ free_extent_state(*cached_state); *cached_state = NULL; + if (state) + goto got_it; goto out; } free_extent_state(*cached_state); @@ -892,6 +959,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state *state; int ret = 1; + ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); + spin_lock(&tree->lock); state = find_first_extent_bit_state(tree, start, bits); if (state) { @@ -987,10 +1056,10 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *prealloc = NULL; struct rb_node **p = NULL; struct rb_node *parent = NULL; - int err = 0; + int ret = 0; u64 last_start; u64 last_end; - u32 exclusive_bits = (bits & EXTENT_LOCKED); + u32 exclusive_bits = (bits & EXTENT_LOCK_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -1012,6 +1081,9 @@ again: */ prealloc = alloc_extent_state(mask); } + /* Optimistically preallocate the extent changeset ulist node. 
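Since insert_state() now hands back either the record that ended up holding the range or an error pointer, the callers below in __set_extent_bit() and convert_extent_bit() test the result with IS_ERR()/PTR_ERR() and only drop their prealloc reference when the tree actually consumed it. A minimal userspace model of the kernel's error-pointer idiom (stand-in definitions; the real helpers live in <linux/err.h>):

#include <errno.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel helpers in <linux/err.h>. */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct state { int dummy; };

static struct state *insert(struct state *prealloc, int collide)
{
    if (collide)
        return ERR_PTR(-EEXIST);  /* overlapping record already present */
    return prealloc;              /* may also be a merged neighbor instead */
}

int main(void)
{
    struct state s;
    struct state *ret = insert(&s, 1);

    if (IS_ERR(ret))
        printf("insert failed: %ld\n", PTR_ERR(ret));

    ret = insert(&s, 0);
    if (!IS_ERR(ret) && ret == &s)
        printf("tree took ownership of the preallocated state\n");
    return 0;
}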
*/ + if (changeset) + extent_changeset_prealloc(changeset, mask); spin_lock(&tree->lock); if (cached_state && *cached_state) { @@ -1050,7 +1122,7 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = state->start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1086,7 +1158,7 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1103,12 +1175,12 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); + ret = split_state(tree, state, prealloc, start); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (err) + if (ret) goto out; if (state->end <= end) { set_state_bits(tree, state, bits, changeset); @@ -1133,6 +1205,8 @@ hit_next: */ if (state->start > start) { u64 this_end; + struct extent_state *inserted_state; + if (end < last_start) this_end = end; else @@ -1148,12 +1222,15 @@ hit_next: */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, changeset); - if (err) - extent_io_tree_panic(tree, err); + inserted_state = insert_state(tree, prealloc, bits, changeset); + if (IS_ERR(inserted_state)) { + ret = PTR_ERR(inserted_state); + extent_io_tree_panic(tree, prealloc, "insert", ret); + } - cache_state(prealloc, cached_state); - prealloc = NULL; + cache_state(inserted_state, cached_state); + if (inserted_state == prealloc) + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -1167,16 +1244,19 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1198,7 +1278,7 @@ out: if (prealloc) free_extent_state(prealloc); - return err; + return ret; } @@ -1235,7 +1315,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *prealloc = NULL; struct rb_node **p = NULL; struct rb_node *parent = NULL; - int err = 0; + int ret = 0; u64 last_start; u64 last_end; bool first_iteration = true; @@ -1274,7 +1354,7 @@ again: if (!state) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } prealloc->start = start; @@ -1325,14 +1405,14 @@ hit_next: if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); + ret = split_state(tree, state, prealloc, start); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (err) + if (ret) goto out; if (state->end <= end) { set_state_bits(tree, state, bits, NULL); @@ -1356,6 +1436,8 @@ hit_next: */ if (state->start > start) { u64 this_end; + struct extent_state *inserted_state; + if (end < last_start) this_end = end; else @@ -1363,7 +1445,7 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); 
if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -1373,11 +1455,15 @@ hit_next: */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, NULL); - if (err) - extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; + inserted_state = insert_state(tree, prealloc, bits, NULL); + if (IS_ERR(inserted_state)) { + ret = PTR_ERR(inserted_state); + extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; + } + cache_state(inserted_state, cached_state); + if (inserted_state == prealloc) + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -1390,13 +1476,13 @@ hit_next: if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); @@ -1418,7 +1504,7 @@ out: if (prealloc) free_extent_state(prealloc); - return err; + return ret; } /* @@ -1640,15 +1726,46 @@ search: } /* - * Search a range in the state tree for a given mask. If 'filled' == 1, this - * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 - * is returned if any bit in the range is found set. + * Check if the single @bit exists in the given range. */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) +bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) { struct extent_state *state = NULL; - int bitset = 0; + bool bitset = false; + + ASSERT(is_power_of_2(bit)); + + spin_lock(&tree->lock); + state = tree_search(tree, start); + while (state && start <= end) { + if (state->start > end) + break; + + if (state->state & bit) { + bitset = true; + break; + } + + /* If state->end is (u64)-1, start will overflow to 0 */ + start = state->end + 1; + if (start > end || start == 0) + break; + state = next_state(state); + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * Check if the whole range [@start,@end) contains the single @bit set. + */ +bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached) +{ + struct extent_state *state = NULL; + bool bitset = true; + + ASSERT(is_power_of_2(bit)); spin_lock(&tree->lock); if (cached && extent_state_in_tree(cached) && cached->start <= start && @@ -1657,35 +1774,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, else state = tree_search(tree, start); while (state && start <= end) { - if (filled && state->start > start) { - bitset = 0; + if (state->start > start) { + bitset = false; break; } if (state->start > end) break; - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; + if ((state->state & bit) == 0) { + bitset = false; break; } if (state->end == (u64)-1) break; + /* + * Last entry (if state->end is (u64)-1 and overflow happens), + * or next entry starts after the range. + */ start = state->end + 1; - if (start > end) + if (start > end || start == 0) break; state = next_state(state); } /* We ran out of states and were still inside of our range. 
*/ - if (filled && !state) - bitset = 0; + if (!state) + bitset = false; spin_unlock(&tree->lock); return bitset; } @@ -1695,12 +1812,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. + * We don't support EXTENT_LOCK_BITS yet, as current changeset will + * record any bits changed, so for EXTENT_LOCK_BITS case, it will either + * fail with -EEXIST or changeset will record the whole range. */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } @@ -1709,26 +1825,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * Don't support EXTENT_LOCKED case, same reason as + * Don't support EXTENT_LOCK_BITS case, same reason as * set_record_extent_bits(). */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached); + clear_extent_bit(tree, start, failed_start - 1, bits, cached); return 0; } return 1; @@ -1738,23 +1853,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. 
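The old test_range_bit(tree, start, end, bits, filled, cached) collapsed two different questions into one 'filled' flag; the split above gives each its own helper: test_range_bit_exists() asks whether the bit is set anywhere in the range, test_range_bit() whether it is set across the whole range. A toy model of the two predicates over a sorted array of inclusive ranges, matching the tree's [start, end] convention (the kernel walks an rbtree and adds the same overflow guard for end == (u64)-1):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_state { uint64_t start, end; uint32_t bits; };

/* "exists": true if any state overlapping [start, end] has the bit. */
static bool range_bit_exists(const struct toy_state *s, int n,
                             uint64_t start, uint64_t end, uint32_t bit)
{
    for (int i = 0; i < n; i++)
        if (s[i].end >= start && s[i].start <= end && (s[i].bits & bit))
            return true;
    return false;
}

/* "test": true only if [start, end] is fully covered by states with the bit. */
static bool range_bit_all(const struct toy_state *s, int n,
                          uint64_t start, uint64_t end, uint32_t bit)
{
    uint64_t next = start;

    for (int i = 0; i < n && next <= end; i++) {
        if (s[i].end < next)
            continue;
        if (s[i].start > next || !(s[i].bits & bit))
            return false;          /* hole or wrong state in the middle */
        next = s[i].end + 1;       /* ranges are inclusive */
        if (next == 0)             /* end == (u64)-1 overflow, as in the patch */
            break;
    }
    return next > end || next == 0;
}

int main(void)
{
    /* Two states with a hole at [30, 39]. */
    const struct toy_state s[] = { { 0, 29, 0x1 }, { 40, 99, 0x1 } };

    printf("exists in [20,50]: %d\n", range_bit_exists(s, 2, 20, 50, 0x1)); /* 1 */
    printf("all of [20,50]:    %d\n", range_bit_all(s, 2, 20, 50, 0x1));    /* 0 */
    return 0;
}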
*/ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached_state); + bits, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, - &failed_state); - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + wait_extent_bit(tree, failed_start, end, bits, &failed_state); + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); } @@ -1770,8 +1884,8 @@ void __cold extent_state_free_cachep(void) int __init extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_state), 0, 0, + NULL); if (!extent_state_cache) return -ENOMEM; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 28c23a23d121..6ffef1cd37c1 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,15 +3,23 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/list.h> +#include <linux/wait.h> #include "misc.h" struct extent_changeset; +struct btrfs_fs_info; +struct btrfs_inode; /* Bits for the extent state */ enum { ENUM_BIT(EXTENT_DIRTY), ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), @@ -60,6 +68,8 @@ enum { EXTENT_ADD_INODE_BYTES | \ EXTENT_CLEAR_ALL_BITS) +#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED) + /* * Redefined bits above which are used only in the device allocation tree, * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV @@ -87,9 +97,17 @@ enum { struct extent_io_tree { struct rb_root state; - struct btrfs_fs_info *fs_info; - /* Inode associated with this tree, or NULL. */ - struct btrfs_inode *inode; + /* + * The fs_info is needed for trace points, a tree attached to an inode + * needs the inode. 
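EXTENT_DIO_LOCKED joins EXTENT_LOCKED above, with EXTENT_LOCK_BITS grouping the two: direct IO serializes on its own bit (through the lock_dio_extent()/try_lock_dio_extent()/unlock_dio_extent() wrappers added further down) while merging, state caching and wakeups treat either bit as "locked". A toy model of the bit semantics, with one flags word standing in for a whole range's extent state; as in __set_extent_bit(), where exclusive_bits = bits & EXTENT_LOCK_BITS, each lock bit is exclusive only against itself:

#include <stdbool.h>
#include <stdio.h>

enum {
    EXTENT_LOCKED     = 1 << 0,
    EXTENT_DIO_LOCKED = 1 << 1,
};
#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED)

/* Fail only if the requested bit itself is already held. */
static bool try_lock_bits(unsigned *state, unsigned bits)
{
    if (*state & bits)
        return false;
    *state |= bits;
    return true;
}

int main(void)
{
    unsigned state = 0;

    printf("buffered lock:  %d\n", try_lock_bits(&state, EXTENT_LOCKED));     /* 1 */
    /* A second buffered lock on the same range must fail... */
    printf("buffered again: %d\n", try_lock_bits(&state, EXTENT_LOCKED));     /* 0 */
    /* ...but the DIO bit is independent, so DIO can still take it. */
    printf("dio lock:       %d\n", try_lock_bits(&state, EXTENT_DIO_LOCKED)); /* 1 */
    /* Merge/cache/wakeup decisions treat either bit as "locked". */
    printf("range locked:   %s\n", (state & EXTENT_LOCK_BITS) ? "yes" : "no");
    return 0;
}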
+ * + * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be + * accessed as inode->root->fs_info + */ + union { + struct btrfs_fs_info *fs_info; + struct btrfs_inode *inode; + }; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; @@ -112,15 +130,29 @@ struct extent_state { #endif }; +struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree); +const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree); +const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree); + void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); @@ -131,8 +163,9 @@ u64 count_range_bits(struct extent_io_tree *tree, struct extent_state **cached_state); void free_extent_state(struct extent_state *state); -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached_state); +bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached_state); +bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -192,7 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state); +static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); +} #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 021cf468274b..bb3602059906 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -18,7 +18,7 @@ #include <linux/crc32c.h> #include "ctree.h" #include "extent-tree.h" 
-#include "tree-log.h" +#include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" @@ -26,14 +26,11 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" -#include "delalloc-space.h" #include "discard.h" -#include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" @@ -42,14 +39,14 @@ #include "file-item.h" #include "orphan.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" #undef SCRAMBLE_DELAYED_REFS static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -57,7 +54,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod); + struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); @@ -100,18 +97,17 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; struct btrfs_key key; - u32 item_size; u64 num_refs; u64 extent_flags; + u64 owner = 0; int ret; /* @@ -127,11 +123,6 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (!trans) { - path->skip_locking = 1; - path->search_commit_root = 1; - } - search_again: key.objectid = bytenr; key.offset = offset; @@ -145,7 +136,7 @@ search_again: if (ret < 0) goto out_free; - if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -158,36 +149,37 @@ search_again: } if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - } else { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_item *ei; + const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + + if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(fs_info, ret, NULL); - + btrfs_abort_transaction(trans, ret); goto out_free; } - BUG_ON(num_refs == 0); + ei = btrfs_item_ptr(leaf, path->slots[0], struct 
btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + if (unlikely(num_refs == 0)) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected zero reference count for extent item (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, ret); + goto out_free; + } + extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); } else { num_refs = 0; extent_flags = 0; ret = 0; } - if (!trans) - goto out; - delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); @@ -210,20 +202,20 @@ search_again: spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); -out: + WARN_ON(num_refs == 0); if (refs) *refs = num_refs; if (flags) *flags = extent_flags; + if (owning_root) + *owning_root = owner; out_free: btrfs_free_path(path); return ret; @@ -344,9 +336,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { + struct btrfs_fs_info *fs_info = eb->fs_info; int type = btrfs_extent_inline_ref_type(eb, iref); u64 offset = btrfs_extent_inline_ref_offset(eb, iref); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + return type; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY || @@ -355,26 +353,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_TREE_BLOCK_REF_KEY) return type; if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ - if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY) return type; if (type == BTRFS_SHARED_DATA_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. 
*/ if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else { @@ -385,7 +382,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, WARN_ON(1); btrfs_print_leaf(eb); - btrfs_err(eb->fs_info, + btrfs_err(fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); @@ -399,11 +396,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -438,9 +435,8 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; u32 nritems; - int ret; int recow; - int err = -ENOENT; + int ret; key.objectid = bytenr; if (parent) { @@ -454,26 +450,26 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, again: recow = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } + if (ret < 0) + return ret; if (parent) { - if (!ret) - return 0; - goto fail; + if (ret) + return -ENOENT; + return 0; } + ret = -ENOENT; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; + if (ret) { + if (ret > 0) + return -ENOENT; + return ret; + } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -494,37 +490,37 @@ again: btrfs_release_path(path); goto again; } - err = 0; + ret = 0; break; } path->slots[0]++; } fail: - return err; + return ret; } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) + struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u32 size; u32 num_refs; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; + key.offset = node->parent; size = sizeof(struct btrfs_shared_data_ref); } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); + key.offset = hash_extent_data_ref(node->ref_root, owner, offset); size = sizeof(struct btrfs_extent_data_ref); } @@ -533,15 +529,15 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, goto fail; leaf = path->nodes[0]; - if (parent) { + if (node->parent) { struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; 
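hash_extent_data_ref(), shown a little above, packs the (root, owner, offset) triple into the 64-bit key offset of an EXTENT_DATA_REF item by combining two CRC32Cs; the switch from btrfs_crc32c() to plain crc32c() is cosmetic, both compute the Castagnoli CRC. A self-contained userspace replica, with a bitwise CRC helper standing in for the kernel's accelerated one; it assumes a little-endian host (so storing the u64 directly matches cpu_to_le64()) and ~0 seeds, which mirror the function's initializers sitting just outside the hunk shown (an assumption here):

#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC32C (Castagnoli), reflected polynomial 0x82F63B78. */
static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
{
    const uint8_t *p = buf;

    while (len--) {
        crc ^= *p++;
        for (int i = 0; i < 8; i++)
            crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
    }
    return crc;
}

static uint64_t hash_extent_data_ref(uint64_t root, uint64_t owner, uint64_t offset)
{
    uint32_t high_crc = ~(uint32_t)0;
    uint32_t low_crc = ~(uint32_t)0;
    uint64_t le;

    /* The kernel hashes the little-endian encoding of each field. */
    le = root;   high_crc = crc32c_sw(high_crc, &le, sizeof(le));
    le = owner;  low_crc  = crc32c_sw(low_crc,  &le, sizeof(le));
    le = offset; low_crc  = crc32c_sw(low_crc,  &le, sizeof(le));

    /* The 31-bit (not 32) shift deliberately overlaps one bit of the
     * two CRCs, matching ((u64)high_crc << 31) ^ (u64)low_crc above. */
    return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
}

int main(void)
{
    printf("%llu\n", (unsigned long long)hash_extent_data_ref(5, 257, 0));
    return 0;
}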
btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } } else { @@ -549,7 +545,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, while (ret == -EEXIST) { ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, + if (match_extent_data_ref(leaf, ref, node->ref_root, owner, offset)) break; btrfs_release_path(path); @@ -564,14 +560,13 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); + btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } @@ -695,20 +690,20 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) + struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; + key.offset = node->parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; + key.offset = node->ref_root; } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -789,7 +784,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int type; int want; int ret; - int err = 0; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); int needed; @@ -816,10 +810,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } /* * We may be a newly converted file system which still has the old fat @@ -846,7 +838,7 @@ again: } if (ret && !insert) { - err = -ENOENT; + ret = -ENOENT; goto out; } else if (WARN_ON(ret)) { btrfs_print_leaf(path->nodes[0]); @@ -854,18 +846,18 @@ again: "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", bytenr, num_bytes, parent, root_objectid, owner, offset); - err = -EIO; + ret = -EUCLEAN; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %llu expect >= %zu", item_size, sizeof(*ei)); - btrfs_abort_transaction(trans, err); + btrfs_abort_transaction(trans, ret); goto out; } @@ -885,22 +877,17 @@ again: else needed = BTRFS_REF_TYPE_BLOCK; - err = -ENOENT; - while (1) { - if (ptr >= end) { - if (ptr > end) { - err = -EUCLEAN; - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, -"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", - path->slots[0], root_objectid, owner, offset, parent); - } - break; - } + ret = -ENOENT; + while (ptr < end) { iref = 
(struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ptr += btrfs_extent_inline_ref_size(type); + continue; + } if (type == BTRFS_REF_TYPE_INVALID) { - err = -EUCLEAN; + ret = -EUCLEAN; goto out; } @@ -916,7 +903,7 @@ again: dref = (struct btrfs_extent_data_ref *)(&iref->offset); if (match_extent_data_ref(leaf, dref, root_objectid, owner, offset)) { - err = 0; + ret = 0; break; } if (hash_extent_data_ref_item(leaf, dref) < @@ -927,14 +914,14 @@ again: ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); if (parent > 0) { if (parent == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < parent) break; } else { if (root_objectid == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < root_objectid) @@ -943,10 +930,20 @@ again: } ptr += btrfs_extent_inline_ref_size(type); } - if (err == -ENOENT && insert) { + + if (unlikely(ptr > end)) { + ret = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + goto out; + } + + if (ret == -ENOENT && insert) { if (item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* @@ -958,7 +955,7 @@ again: if (find_next_key(path, 0, &key) == 0 && key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } } @@ -969,7 +966,7 @@ out: path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } - return err; + return ret; } /* @@ -1443,7 +1440,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1456,7 +1453,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, } /* - * __btrfs_inc_extent_ref - insert backreference for a given extent + * Insert backreference for a given extent. * * The counterpart is in __btrfs_free_extent(), with examples and more details * how it works. @@ -1466,36 +1463,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * @node: The delayed ref node used to get the bytenr/length for * extent whose references are incremented. * - * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ - * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical - * bytenr of the parent block. Since new extents are always - * created with indirect references, this will only be the case - * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must - * be 0 - * - * @root_objectid: The id of the root where this modification has originated, - * this can be either one of the well-known metadata trees or - * the subvolume id which references this extent. - * - * @owner: For data extents it is the inode number of the owning file. - * For metadata extents this parameter holds the level in the - * tree of the extent. - * - * @offset: For metadata extents the offset is ignored and is currently - * always passed as 0. 
For data extents it is the fileoffset - * this extent belongs to. - * - * @refs_to_add Number of references to add - * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_path *path; @@ -1504,7 +1477,10 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_key key; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u64 refs; + int refs_to_add = node->ref_mod; int ret; path = btrfs_alloc_path(); @@ -1513,7 +1489,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, - parent, root_objectid, owner, + node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; @@ -1536,12 +1512,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); + ret = insert_tree_block_ref(trans, path, node, bytenr); else - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); + ret = insert_extent_data_ref(trans, path, node, bytenr); if (ret) btrfs_abort_transaction(trans, ret); @@ -1550,45 +1523,68 @@ out: return ret; } +static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *href) +{ + u64 root = href->owning_root; + + /* + * Don't check must_insert_reserved, as this is called from contexts + * where it has already been unset. 
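The long parameter lists disappear because struct btrfs_delayed_ref_node now carries parent, ref_root and the owner/offset pair itself; call sites such as __btrfs_inc_extent_ref() above recover the latter through btrfs_delayed_ref_owner() and btrfs_delayed_ref_offset(). A compilable sketch of the shape those helpers plausibly have; the real definitions live in fs/btrfs/delayed-ref.h, and the struct layout below is a simplified stand-in, not the kernel's:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;
/* On-disk key type values for the two data ref flavors. */
enum { BTRFS_EXTENT_DATA_REF_KEY = 178, BTRFS_SHARED_DATA_REF_KEY = 184 };

struct btrfs_delayed_ref_node {
    int type;
    union {
        struct { u64 objectid, offset; } data_ref; /* data extents */
        struct { int level; } tree_ref;            /* metadata blocks */
    };
};

/* Assumed shape of the accessors used above, not verbatim kernel code. */
static u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node)
{
    if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
        node->type == BTRFS_SHARED_DATA_REF_KEY)
        return node->data_ref.objectid;  /* inode number for data */
    return node->tree_ref.level;         /* tree level for metadata */
}

static u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node)
{
    if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
        node->type == BTRFS_SHARED_DATA_REF_KEY)
        return node->data_ref.offset;    /* file offset for data */
    return 0;                            /* metadata refs have no offset */
}

int main(void)
{
    struct btrfs_delayed_ref_node n = { .type = BTRFS_EXTENT_DATA_REF_KEY };

    n.data_ref.objectid = 257;
    n.data_ref.offset = 4096;
    printf("owner=%llu offset=%llu\n",
           (unsigned long long)btrfs_delayed_ref_owner(&n),
           (unsigned long long)btrfs_delayed_ref_offset(&n));
    return 0;
}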
+ */ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || + !href->is_data || !is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, + BTRFS_QGROUP_RSV_DATA); +} + static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; - struct btrfs_delayed_data_ref *ref; - struct btrfs_key ins; u64 parent = 0; - u64 ref_root = 0; u64 flags = 0; - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_data_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_DATA_REF_KEY) - parent = ref->parent; - ref_root = ref->root; + parent = node->parent; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + struct btrfs_key key; + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = node->num_bytes, + .is_data = true, + .is_inc = true, + .generation = trans->transid, + }; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); + if (extent_op) flags |= extent_op->flags_to_set; - ret = alloc_reserved_file_extent(trans, parent, ref_root, - flags, ref->objectid, - ref->offset, &ins, - node->ref_mod); + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + ret = alloc_reserved_file_extent(trans, parent, node->ref_root, + flags, owner, offset, &key, + node->ref_mod, + href->owning_root); + free_head_ref_squota_rsv(trans->fs_info, href); + if (!ret) + ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->objectid, ref->offset, - node->ref_mod, extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1625,7 +1621,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 item_size; int ret; - int err = 0; int metadata = 1; if (TRANS_ABORTED(trans)) @@ -1642,7 +1637,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = extent_op->level; + key.offset = head->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = head->num_bytes; @@ -1652,10 +1647,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - err = ret; goto out; - } - if (ret > 0) { + } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { path->slots[0]--; @@ -1676,10 +1669,10 @@ again: goto again; } } else { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", - head->bytenr, head->num_bytes, extent_op->level); + head->bytenr, head->num_bytes, head->level); goto out; } } @@ -1688,11 +1681,11 @@ again: item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_err(fs_info, "unexpected 
extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); - btrfs_abort_transaction(trans, err); + btrfs_abort_transaction(trans, ret); goto out; } @@ -1702,25 +1695,25 @@ again: btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; - struct btrfs_delayed_tree_ref *ref; + struct btrfs_fs_info *fs_info = trans->fs_info; u64 parent = 0; u64 ref_root = 0; - ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_tree_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) - parent = ref->parent; - ref_root = ref->root; + parent = node->parent; + ref_root = node->ref_root; if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, @@ -1730,14 +1723,21 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags); + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = fs_info->nodesize, + .is_data = false, + .is_inc = true, + .generation = trans->transid, + }; + ret = alloc_reserved_tree_block(trans, node, extent_op); + if (!ret) + btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1746,6 +1746,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) @@ -1753,19 +1754,23 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, int ret = 0; if (TRANS_ABORTED(trans)) { - if (insert_reserved) + if (insert_reserved) { btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + free_head_ref_squota_rsv(trans->fs_info, href); + } return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, node, extent_op, + ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, node, extent_op, + ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); + else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) + ret = 0; else BUG(); if (ret && insert_reserved) @@ -1844,28 +1849,38 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - int nr_items = 1; /* Dropping this ref head update. 
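run_delayed_data_ref() and run_delayed_tree_ref() now record a btrfs_squota_delta whenever a reserved extent is actually inserted: with simple quotas, usage is charged to the root that first allocated the extent (href->owning_root) at the generation it happened, instead of being derived from shared-ref walking. A toy model of that bookkeeping; the field set mirrors the initializers above, while the accounting itself is deliberately simplified and not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct squota_delta {
    uint64_t root;        /* root the extent is charged to */
    uint64_t num_bytes;
    bool is_data;
    bool is_inc;          /* allocation vs. final free */
    uint64_t generation;  /* carried for the kernel's ordering checks */
};

/* Toy: one usage counter per root, indexed directly for brevity. */
static uint64_t usage[16];

static void record_squota_delta(const struct squota_delta *d)
{
    if (d->is_inc)
        usage[d->root] += d->num_bytes;
    else
        usage[d->root] -= d->num_bytes;
}

int main(void)
{
    struct squota_delta alloc = {
        .root = 5, .num_bytes = 16384, .is_data = true,
        .is_inc = true, .generation = 100,
    };
    struct squota_delta drop = {
        .root = 5, .num_bytes = 4096, .is_data = true,
        .is_inc = false, .generation = 101,
    };

    record_squota_delta(&alloc);
    record_squota_delta(&drop);
    printf("root 5 usage: %llu\n", (unsigned long long)usage[5]);
    return 0;
}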
*/ + u64 ret = 0; /* * We had csum deletions accounted for in our delayed refs rsv, we need * to drop the csum leaves for this update from our delayed_refs_rsv. */ if (head->total_ref_mod < 0 && head->is_data) { + int nr_csums; + spin_lock(&delayed_refs->lock); delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + + btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); + + ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); } + /* must_insert_reserved can be set only if we didn't run the head ref. */ + if (head->must_insert_reserved) + free_head_ref_squota_rsv(fs_info, head); - btrfs_delayed_refs_rsv_release(fs_info, nr_items); + return ret; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) + struct btrfs_delayed_ref_head *head, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1910,7 +1925,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -1952,7 +1967,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *locked_ref) + struct btrfs_delayed_ref_head *locked_ref, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; @@ -2000,14 +2016,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; + /* + * Unsetting this on the head ref relinquishes ownership of + * the rsv_bytes, so it is critical that every possible code + * path from here forward frees all reserves including qgroup + * reserve. + */ locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); - ret = run_one_delayed_ref(trans, ref, extent_op, + ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, must_insert_reserved); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); + *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); btrfs_free_delayed_extent_op(extent_op); if (ret) { @@ -2031,15 +2055,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
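The __btrfs_run_delayed_refs() rework that follows throttles by reserved bytes rather than a head count: every head processed releases its slice of the delayed-refs rsv into bytes_processed, and the loop keeps going until the caller's min_bytes budget is consumed, with min_bytes == 0 converted into "run the heads that are ready right now" and U64_MAX meaning "run everything, including newly added refs". The stop condition can be modeled like this; queue contents and per-head costs are invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define U64_MAX ((uint64_t)-1)

int main(void)
{
    /* Pretend each queued ref head holds this many reserved bytes. */
    const uint64_t cost[] = { 4096, 8192, 4096, 16384, 4096 };
    const unsigned long nheads = sizeof(cost) / sizeof(cost[0]);
    uint64_t min_bytes = 10000;   /* caller's byte budget; 0 = all current */
    unsigned long max_count = 0;
    uint64_t bytes_processed = 0;
    unsigned long count = 0;

    if (min_bytes == 0) {
        max_count = nheads;       /* run everything queued right now */
        min_bytes = U64_MAX;
    }

    while (count < nheads &&
           ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
            (max_count > 0 && count < max_count))) {
        bytes_processed += cost[count];  /* rsv released for this head */
        count++;
    }

    printf("ran %lu heads, released %llu bytes\n",
           count, (unsigned long long)bytes_processed);
    return 0;
}

With the 10000-byte budget above it runs two heads and stops after crossing the budget at 12288 bytes, which is the same "make at least this much rsv progress, then stop" behavior the loop condition in the patch encodes.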
*/ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long nr) + u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; unsigned long count = 0; + unsigned long max_count = 0; + u64 bytes_processed = 0; delayed_refs = &trans->transaction->delayed_refs; + if (min_bytes == 0) { + max_count = delayed_refs->num_heads_ready; + min_bytes = U64_MAX; + } + do { if (!locked_ref) { locked_ref = btrfs_obtain_ref_head(trans); @@ -2067,7 +2098,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already @@ -2079,7 +2110,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * Success, perform the usual cleanup of a processed * head */ - ret = cleanup_ref_head(trans, locked_ref); + ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); if (ret > 0 ) { /* We dropped our lock, we need to loop. */ ret = 0; @@ -2096,7 +2127,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, locked_ref = NULL; cond_resched(); - } while ((nr != -1 && count < nr) || locked_ref); + } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || + (max_count > 0 && count < max_count) || + locked_ref); return 0; } @@ -2145,24 +2178,25 @@ static u64 find_middle(struct rb_root *root) #endif /* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. + * Start processing the delayed reference count updates and extent insertions + * we have queued up so far. + * + * @trans: Transaction handle. + * @min_bytes: How many bytes of delayed references to process. After this + * many bytes we stop processing delayed references if there are + * any more. If 0 it means to run all existing delayed references, + * but not new ones added after running all existing ones. + * Use (u64)-1 (U64_MAX) to run all existing delayed references + * plus any new ones that are added. 
* * Returns 0 on success or if called with an aborted transaction * Returns <0 on error and aborts the transaction */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count) +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_head *head; int ret; - int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ if (TRANS_ABORTED(trans)) @@ -2172,42 +2206,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, return 0; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) - count = delayed_refs->num_heads_ready; - again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - ret = __btrfs_run_delayed_refs(trans, count); + ret = __btrfs_run_delayed_refs(trans, min_bytes); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; } - if (run_all) { + if (min_bytes == U64_MAX) { btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - node = rb_first_cached(&delayed_refs->href_root); - if (!node) { + if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { spin_unlock(&delayed_refs->lock); - goto out; + return 0; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); - /* Mutex was contended, block until it's released and retry. */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - - btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } -out: + return 0; } @@ -2215,7 +2237,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; - int level = btrfs_header_level(eb); int ret; extent_op = btrfs_alloc_delayed_extent_op(); @@ -2225,9 +2246,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, + btrfs_header_level(eb), extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; @@ -2239,7 +2260,6 @@ static noinline int check_delayed_ref(struct btrfs_root *root, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; struct rb_node *node; @@ -2293,6 +2313,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { + u64 ref_owner; + u64 ref_offset; + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { @@ -2300,15 +2323,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root, break; } - data_ref = btrfs_delayed_node_to_data_ref(ref); + ref_owner = btrfs_delayed_ref_owner(ref); + ref_offset = btrfs_delayed_ref_offset(ref); /* * If our ref doesn't match the one we're currently looking at * then we have a cross reference. 
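Restated as a predicate, the comparison below boils down to matching the triple carried by each delayed ref against the extent being asked about; a sketch using the accessors this series introduces (btrfs_delayed_ref_owner(), btrfs_delayed_ref_offset() and the ref_root member, as used in the hunk):

	/* Sketch: true if this delayed data ref belongs to (root, inode, offset),
	 * i.e. it is NOT evidence of a cross reference. */
	static bool delayed_ref_matches(const struct btrfs_delayed_ref_node *ref,
					u64 root_id, u64 objectid, u64 offset)
	{
		return ref->ref_root == root_id &&
		       btrfs_delayed_ref_owner(ref) == objectid &&
		       btrfs_delayed_ref_offset(ref) == offset;
	}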
*/ - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || - data_ref->offset != offset) { + if (ref->ref_root != btrfs_root_id(root) || + ref_owner != objectid || ref_offset != offset) { ret = 1; break; } @@ -2332,6 +2355,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; + u32 expected_size; int type; int ret; @@ -2342,7 +2366,14 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + ret = -EUCLEAN; + goto out; + } ret = -ENOENT; if (path->slots[0] == 0) @@ -2358,10 +2389,22 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); + + /* No inline refs; we need to bail before checking for owner ref. */ + if (item_size == sizeof(*ei)) + goto out; + + /* Check for an owner ref; skip over it to the real inline refs. */ + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { + expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + iref = (struct btrfs_extent_inline_ref *)(iref + 1); + } /* If extent item has more than 1 inline ref then it's shared */ - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + if (item_size != expected_size) goto out; /* @@ -2373,8 +2416,6 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_root_last_snapshot(&root->root_item))) goto out; - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - /* If this extent has SHARED_DATA_REF then it's shared */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) @@ -2383,8 +2424,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || - btrfs_extent_data_ref_root(leaf, ref) != - root->root_key.objectid || + btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || btrfs_extent_data_ref_objectid(leaf, ref) != objectid || btrfs_extent_data_ref_offset(leaf, ref) != offset) goto out; @@ -2421,14 +2461,11 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int full_backref, int inc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 bytenr; - u64 num_bytes; u64 parent; u64 ref_root; u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct btrfs_ref generic_ref = { 0 }; bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; int action; @@ -2455,6 +2492,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { + struct btrfs_ref ref = { + .action = action, + .parent = parent, + .ref_root = ref_root, + }; + if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) @@ -2464,34 +2507,33 @@ static 
int __btrfs_mod_ref(struct btrfs_trans_handle *trans, if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) + ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (ref.bytenr == 0) continue; - num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.owning_root = ref_root; + key.offset -= btrfs_file_extent_offset(buf, fi); - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); - btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset, root->root_key.objectid, - for_reloc); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } else { - bytenr = btrfs_node_blockptr(buf, i); - num_bytes = fs_info->nodesize; - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, - root->root_key.objectid, for_reloc); + /* We don't know the owning_root, leave as 0. */ + ref.bytenr = btrfs_node_blockptr(buf, i); + ref.num_bytes = fs_info->nodesize; + + btrfs_init_tree_ref(&ref, level - 1, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } @@ -2586,16 +2628,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, return 0; } -/* - * this function must be called within transaction - */ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes) + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(trans->fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) return -EINVAL; @@ -2607,10 +2646,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, if (ret) goto out; - pin_down_extent(trans, cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, eb->start, eb->len, 0); /* remove us from the free space cache (if we're there at all) */ - ret = btrfs_remove_free_space(cache, bytenr, num_bytes); + ret = btrfs_remove_free_space(cache, eb->start, eb->len); out: btrfs_put_block_group(cache); return ret; @@ -2715,6 +2754,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; + int ret = 0; while (start <= end) { readonly = false; @@ -2724,7 +2764,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); /* Logic error */ + if (cache == NULL) { + /* Logic error, something removed the block group. 
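The per-item btrfs_ref in __btrfs_mod_ref() above is built with designated initializers, relying on the C guarantee that members not named are zero-filled, which is what lets btrfs_init_generic_ref() go away. A stand-alone illustration of that guarantee (struct and values invented):

	#include <assert.h>
	#include <stdint.h>

	struct ref {
		int action;
		uint64_t bytenr;
		uint64_t num_bytes;
		uint64_t parent;	/* intentionally not named below */
		uint64_t ref_root;	/* intentionally not named below */
	};

	int main(void)
	{
		struct ref r = {
			.action = 1,
			.bytenr = 1048576,
			.num_bytes = 16384,
		};

		/* Members without an initializer are guaranteed to be zero. */
		assert(r.parent == 0 && r.ref_root == 0);
		return 0;
	}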
*/ + ret = -EUCLEAN; + goto out; + } cluster = fetch_cluster_info(fs_info, cache->space_info, @@ -2794,7 +2838,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (cache) btrfs_put_block_group(cache); - return 0; +out: + return ret; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) @@ -2824,7 +2869,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, &cached_state); - unpin_extent_range(fs_info, start, end, true); + ret = unpin_extent_range(fs_info, start, end, true); + BUG_ON(ret); mutex_unlock(&fs_info->unused_bg_unpin_mutex); free_extent_state(cached_state); cond_resched(); @@ -2851,7 +2897,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) block_group->length, &trimmed); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). + */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -2866,12 +2920,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Parse an extent item's inline extents looking for a simple quotas owner ref. + * + * @fs_info: the btrfs_fs_info for this mount + * @leaf: a leaf in the extent tree containing the extent item + * @slot: the slot in the leaf where the extent item is found + * + * Returns the objectid of the root that originally allocated the extent item + * if the inline owner ref is expected and present, otherwise 0. + * + * If an extent item has an owner ref item, it will be the first inline ref + * item. Therefore the logic is to check whether there are any inline ref + * items, then check the type of the first one. + */ +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_owner_ref *oref; + unsigned long ptr; + unsigned long end; + int type; + + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) + return 0; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + btrfs_item_size(leaf, slot); + + /* No inline ref items of any kind, can't check type. */ + if (ptr == end) + return 0; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); + + /* We found an owner ref, get the root out of it. */ + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + return btrfs_extent_owner_ref_root_id(leaf, oref); + } + + /* We have inline refs, but not an owner ref. 
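Since the owner ref, when present, is always the first inline item, the lookup never scans: it peeks at the first inline ref and returns its root id only if the type matches. A compact user-space model of the same walk (layouts and type values are simplified stand-ins for the on-disk structures):

	#include <stdint.h>
	#include <stddef.h>

	enum { OWNER_REF = 1, DATA_REF = 2 };	/* stand-ins for the real key types */

	struct inline_ref { uint8_t type; uint64_t payload; } __attribute__((packed));
	struct extent_item { uint64_t refs, generation, flags; } __attribute__((packed));

	static uint64_t owner_root(const uint8_t *item, size_t item_size)
	{
		const uint8_t *ptr = item + sizeof(struct extent_item);
		const uint8_t *end = item + item_size;
		const struct inline_ref *iref;

		if (ptr == end)			/* no inline refs at all */
			return 0;
		iref = (const struct inline_ref *)ptr;
		if (iref->type == OWNER_REF)	/* first inline ref is the owner ref */
			return iref->payload;
		return 0;			/* inline refs exist, but no owner ref */
	}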
*/ + return 0; +} + static int do_free_extent_accounting(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, bool is_data) + u64 bytenr, struct btrfs_squota_delta *delta) { int ret; + u64 num_bytes = delta->num_bytes; - if (is_data) { + if (delta->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(trans->fs_info, bytenr); @@ -2880,6 +2983,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } + + ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = btrfs_record_squota_delta(trans->fs_info, delta); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; } ret = add_to_free_space_tree(trans, bytenr, num_bytes); @@ -2962,9 +3077,8 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -2979,11 +3093,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int refs_to_drop = node->ref_mod; u32 item_size; u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner_objectid = btrfs_delayed_ref_owner(node); + u64 owner_offset = btrfs_delayed_ref_offset(node); bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + u64 delayed_ref_root = href->owning_root; extent_root = btrfs_extent_root(info, bytenr); ASSERT(extent_root); @@ -3007,7 +3125,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, skinny_metadata = false; ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, - parent, root_objectid, owner_objectid, + node->parent, node->ref_root, owner_objectid, owner_offset); if (ret == 0) { /* @@ -3109,7 +3227,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else if (WARN_ON(ret == -ENOENT)) { abort_and_dump(trans, path, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", - bytenr, parent, root_objectid, owner_objectid, + bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); goto out; } else { @@ -3184,6 +3302,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + struct btrfs_squota_delta delta = { + .root = delayed_ref_root, + .num_bytes = num_bytes, + .is_data = is_data, + .is_inc = false, + .generation = btrfs_extent_generation(leaf, ei), + }; + /* In this branch refs == 1 */ if (found_extent) { if (is_data && refs_to_drop != @@ -3222,6 +3348,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, num_to_del = 2; } } + /* + * We can't infer the data owner from the delayed ref, so we need + * to try to get it from the owning ref item. + * + * If it is not present, then that extent was not written under + * simple quotas mode, so we don't need to account for its deletion. 
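Every free (and, later in this diff, every logged allocation) now fills a btrfs_squota_delta; a minimal model of how a consumer would apply one, with field names mirroring the struct in the hunks and the apply function itself being an assumption about the consumer:

	#include <stdbool.h>
	#include <stdint.h>

	struct squota_delta {
		uint64_t root;		/* root whose usage changes */
		uint64_t num_bytes;
		uint64_t generation;
		bool is_data;
		bool is_inc;		/* true on allocation, false on free */
	};

	/* Assumed consumer: adjust one root's usage counter by the delta. */
	static void apply_squota_delta(uint64_t *root_usage,
				       const struct squota_delta *delta)
	{
		if (delta->is_inc)
			*root_usage += delta->num_bytes;
		else
			*root_usage -= delta->num_bytes;
	}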
+ */ + if (is_data) + delta.root = btrfs_get_extent_owner_root(trans->fs_info, + leaf, extent_slot); ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); @@ -3231,7 +3367,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); + ret = do_free_extent_accounting(trans, bytenr, &delta); } btrfs_release_path(path); @@ -3301,82 +3437,92 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_ref generic_ref = { 0 }; + struct btrfs_block_group *bg; int ret; - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root_id, 0, false); - if (root_id != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_ref generic_ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = buf->start, + .num_bytes = buf->len, + .parent = parent, + .owning_root = btrfs_header_owner(buf), + .ref_root = root_id, + }; + + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * detail. + */ + ASSERT(btrfs_header_bytenr(buf) != 0); + + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); if (ret < 0) return ret; } - if (last_ref && btrfs_header_generation(buf) == trans->transid) { - struct btrfs_block_group *cache; - bool must_pin = false; - - if (root_id != BTRFS_TREE_LOG_OBJECTID) { - ret = check_ref_cleanup(trans, buf->start); - if (!ret) { - btrfs_redirty_list_add(trans->transaction, buf); - goto out; - } - } + if (!last_ref) + return 0; - cache = btrfs_lookup_block_group(fs_info, buf->start); + if (btrfs_header_generation(buf) != trans->transid) + goto out; - if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); + if (root_id != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, buf->start); + if (!ret) goto out; - } + } - /* - * If there are tree mod log users we may have recorded mod log - * operations for this node. If we re-allocate this node we - * could replay operations on this node that happened when it - * existed in a completely different root. For example if it - * was part of root A, then was reallocated to root B, and we - * are doing a btrfs_old_search_slot(root b), we could replay - * operations that happened when the block was part of root A, - * giving us an inconsistent view of the btree. - * - * We are safe from races here because at this point no other - * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins we will not have an - * existing log of operations on this node that we have to - * contend with. 
- */ - if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) - must_pin = true; + bg = btrfs_lookup_block_group(fs_info, buf->start); - if (must_pin || btrfs_is_zoned(fs_info)) { - btrfs_redirty_list_add(trans->transaction, buf); - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); - goto out; - } + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(trans, bg, buf->start, buf->len, 1); + btrfs_put_block_group(bg); + goto out; + } - WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + /* + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * + * We are safe from races here because at this point no other + * node or root points to this extent buffer, so if after this + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. + */ - btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_free_reserved_bytes(cache, buf->len, 0); - btrfs_put_block_group(cache); - trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) + || btrfs_is_zoned(fs_info)) { + pin_down_extent(trans, bg, buf->start, buf->len, 1); + btrfs_put_block_group(bg); + goto out; } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(bg, buf->start, buf->len); + btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_put_block_group(bg); + trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + out: - if (last_ref) { - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - } + + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); return 0; } @@ -3393,12 +3539,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. 
*/ - if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { - /* unlocks the pinned mutex */ - btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); + if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { + btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); @@ -3406,10 +3548,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ret = btrfs_add_delayed_data_ref(trans, ref, 0); } - if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) + if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) btrfs_ref_tree_mod(fs_info, ref); return ret; @@ -3515,6 +3654,21 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + /* * Helper function for find_free_extent(). * @@ -3536,7 +3690,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (!cluster_bg) goto refill_cluster; if (cluster_bg != bg && (cluster_bg->ro || - !block_group_bits(cluster_bg, ffe_ctl->flags))) + !block_group_bits(cluster_bg, ffe_ctl->flags) || + !find_free_extent_check_size_class(ffe_ctl, cluster_bg))) goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, @@ -4092,21 +4247,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } -static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group *bg) -{ - if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) - return true; - if (!btrfs_block_group_should_use_size_class(bg)) - return true; - if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) - return true; - if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && - bg->size_class == BTRFS_BG_SZ_NONE) - return true; - return ffe_ctl->size_class == bg->size_class; -} - static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4490,8 +4630,8 @@ loop: } /* - * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a - * hole that is at least as big as @num_bytes. + * Entry point to the extent allocator. Tries to find a hole that is at least + * as big as @num_bytes. 
* * @root - The root that will contain this extent * @@ -4544,7 +4684,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; - bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); @@ -4610,20 +4750,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(trans->fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) { btrfs_err(trans->fs_info, "unable to find block group for %llu", - start); + eb->start); return -ENOSPC; } - ret = pin_down_extent(trans, cache, start, len, 1); + ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); btrfs_put_block_group(cache); return ret; } @@ -4653,24 +4793,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) + struct btrfs_key *ins, int ref_mod, u64 oref_root) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; + struct btrfs_extent_owner_ref *oref; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; int type; u32 size; + const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + size = sizeof(*extent_item); + if (simple_quota) + size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + size += btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); if (!path) @@ -4692,7 +4837,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, flags | BTRFS_EXTENT_FLAG_DATA); iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + if (simple_quota) { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); + iref = (struct btrfs_extent_inline_ref *)(oref + 1); + } btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { struct btrfs_shared_data_ref *ref; ref = (struct btrfs_shared_data_ref *)(iref + 1); @@ -4726,16 +4878,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 flags = extent_op->flags_to_set; + const u64 flags = (extent_op ? extent_op->flags_to_set : 0); + /* The owner of a tree block is the level. 
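With skinny metadata the level travels in the key offset instead of a separate tree_block_info, which is why the delayed ref's owner is enough here. A model of the two key shapes (the key type values are the on-disk constants; the struct is simplified):

	#include <stdint.h>

	struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

	enum { EXTENT_ITEM_KEY = 168, METADATA_ITEM_KEY = 169 };

	/* Model: how alloc_reserved_tree_block() shapes the extent key. */
	static struct key metadata_extent_key(uint64_t bytenr, uint64_t num_bytes,
					      int level, int skinny)
	{
		struct key k = { .objectid = bytenr };

		if (skinny) {
			k.type = METADATA_ITEM_KEY;
			k.offset = level;	/* the block level rides in the offset */
		} else {
			k.type = EXTENT_ITEM_KEY;
			k.offset = num_bytes;
		}
		return k;
	}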
*/ + int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - ref = btrfs_delayed_node_to_tree_ref(node); - extent_key.objectid = node->bytenr; if (skinny_metadata) { - extent_key.offset = ref->level; + /* The owner of a tree block is the level. */ + extent_key.offset = level; extent_key.type = BTRFS_METADATA_ITEM_KEY; } else { extent_key.offset = node->num_bytes; @@ -4768,18 +4920,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); - btrfs_set_tree_block_level(leaf, block_info, ref->level); + btrfs_set_tree_block_level(leaf, block_info, level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } btrfs_mark_buffer_dirty(trans, leaf); @@ -4793,14 +4945,20 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_ref generic_ref = { 0 }; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins->objectid, + .num_bytes = ins->offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + + ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); - BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) + generic_ref.owning_root = root->relocation_src_root; - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, - offset, 0, false); + btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); @@ -4819,6 +4977,13 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; + struct btrfs_squota_delta delta = { + .root = root_objectid, + .num_bytes = ins->offset, + .generation = trans->transid, + .is_data = true, + .is_inc = true, + }; /* * Mixed block groups will exclude before processing the log so we only @@ -4844,13 +5009,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, spin_unlock(&space_info->lock); ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, - offset, ins, 1); + offset, ins, 1, root_objectid); if (ret) btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; } +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extra safety check in case the extent tree is corrupted and extent allocator + * chooses to use a tree block which is already used and locked. 
+ */ +static bool check_eb_lock_owner(const struct extent_buffer *eb) +{ + if (eb->lock_owner == current->pid) { + btrfs_err_rl(eb->fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + eb->start, btrfs_header_owner(eb), current->pid); + return true; + } + return false; +} +#else +static bool check_eb_lock_owner(struct extent_buffer *eb) +{ + return false; +} +#endif + static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level, u64 owner, @@ -4864,15 +5052,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - /* - * Extra safety check in case the extent tree is corrupted and extent - * allocator chooses to use a tree block which is already used and - * locked. - */ - if (buf->lock_owner == current->pid) { - btrfs_err_rl(fs_info, -"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", - buf->start, btrfs_header_owner(buf), current->pid); + if (check_eb_lock_owner(buf)) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -4901,10 +5081,10 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, */ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); - __btrfs_tree_lock(buf, nest); + btrfs_tree_lock_nested(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); + clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); set_extent_buffer_uptodate(buf); @@ -4916,7 +5096,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_owner(buf, owner); write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different @@ -4949,18 +5129,18 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; - struct btrfs_delayed_extent_op *extent_op; - struct btrfs_ref generic_ref = { 0 }; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + u64 owning_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { @@ -4987,42 +5167,54 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = PTR_ERR(buf); goto out_free_reserved; } + owning_root = btrfs_header_owner(buf); if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + owning_root = reloc_src_root; } else BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - extent_op = btrfs_alloc_delayed_extent_op(); - if (!extent_op) { - ret = -ENOMEM; - goto out_free_buf; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .parent = parent, + .owning_root = owning_root, + .ref_root = root_objectid, + }; + 
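/* With skinny metadata and no flags to set, a delayed extent_op would record nothing: the key is not stored (update_key stays false) and there are no flag updates. Allocating the extent_op only when it will carry information, as the branch below does, avoids a pointless allocation on the common tree block allocation path. */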
+ if (!skinny_metadata || flags != 0) { + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = (skinny_metadata ? false : true); + extent_op->update_flags = (flags != 0); + } else { + extent_op = NULL; } - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = skinny_metadata ? false : true; - extent_op->update_flags = true; - extent_op->level = level; - - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent); - btrfs_init_tree_ref(&generic_ref, level, root_objectid, - root->root_key.objectid, false); + + btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); - if (ret) - goto out_free_delayed; + if (ret) { + btrfs_free_delayed_extent_op(extent_op); + goto out_free_buf; + } } return buf; -out_free_delayed: - btrfs_free_delayed_extent_op(extent_op); out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); @@ -5047,11 +5239,99 @@ struct walk_control { int reada_slot; int reada_count; int restarted; + /* Indicate that extent info needs to be looked up when walking the tree. */ + int lookup_info; }; +/* + * This is our normal stage. We are traversing blocks the current snapshot owns + * and we are dropping any of our references to any children we are able to, and + * then freeing the block once we've processed all of the children. + */ #define DROP_REFERENCE 1 + +/* + * We enter this stage when we have to walk into a child block (meaning we can't + * simply drop our reference to it from our current parent node) and there are + * more than one reference on it. If we are the owner of any of the children + * blocks from the current parent node then we have to do the FULL_BACKREF dance + * on them in order to drop our normal ref and add the shared ref. + */ #define UPDATE_BACKREF 2 +/* + * Decide if we need to walk down into this node to adjust the references. + * + * @root: the root we are currently deleting + * @wc: the walk control for this deletion + * @eb: the parent eb that we're currently visiting + * @refs: the number of refs for wc->level - 1 + * @flags: the flags for wc->level - 1 + * @slot: the slot in the eb that we're currently checking + * + * This is meant to be called when we're evaluating if a node we point to at + * wc->level should be read and walked into, or if we can simply delete our + * reference to it. We return true if we should walk into the node, false if we + * can skip it. + * + * We have assertions in here to make sure this is called correctly. We assume + * that sanity checking on the blocks read to this point has been done, so any + * corrupted file systems must have been caught before calling this function. + */ +static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, + struct extent_buffer *eb, u64 refs, u64 flags, int slot) +{ + struct btrfs_key key; + u64 generation; + int level = wc->level; + + ASSERT(level > 0); + ASSERT(wc->refs[level - 1] > 0); + + /* + * The update backref stage we only want to skip if we already have + * FULL_BACKREF set, otherwise we need to read. 
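 * Summed up, the checks below evaluate in order: UPDATE_BACKREF with
 * FULL_BACKREF already set at level 1 -> skip; any other UPDATE_BACKREF ->
 * walk in; last reference -> walk in; FULL_BACKREF set at level 1 -> skip;
 * update_ref not set, or block older than the root's origin generation ->
 * skip; key already covered by a resumed deletion's progress -> skip;
 * anything else -> walk in.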
+ */ + if (wc->stage == UPDATE_BACKREF) { + if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + return false; + return true; + } + + /* + * We're the last ref on this block, we must walk into it and process + * any refs it's pointing at. + */ + if (wc->refs[level - 1] == 1) + return true; + + /* + * If we're already FULL_BACKREF then we know we can just drop our + * current reference. + */ + if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + return false; + + /* + * This block is older than our creation generation, we can drop our + * reference to it. + */ + generation = btrfs_node_ptr_generation(eb, slot); + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) + return false; + + /* + * This block was processed from a previous snapshot deletion run, we + * can skip it. + */ + btrfs_node_key_to_cpu(eb, &key, slot); + if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0) + return false; + + /* All other cases we need to wander into the node. */ + return true; +} + static noinline void reada_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct walk_control *wc, @@ -5063,7 +5343,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 refs; u64 flags; u32 nritems; - struct btrfs_key key; struct extent_buffer *eb; int ret; int slot; @@ -5093,13 +5372,13 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, - &flags); + &flags, NULL); /* We don't care about errors in readahead. */ if (ret < 0) continue; @@ -5113,26 +5392,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, if (refs == 0) continue; - if (wc->stage == DROP_REFERENCE) { - if (refs == 1) - goto reada; - - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - if (!wc->update_ref || - generation <= root->root_key.offset) - continue; - btrfs_node_key_to_cpu(eb, &key, slot); - ret = btrfs_comp_cpu_keys(&key, - &wc->update_progress); - if (ret < 0) - continue; - } else { - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - } + /* If we don't need to visit this node don't reada. */ + if (!visit_node_for_delete(root, wc, eb, refs, flags, slot)) + continue; reada: btrfs_readahead_node_child(eb, slot); nread++; @@ -5151,7 +5413,7 @@ reada: static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc, int lookup_info) + struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; int level = wc->level; @@ -5159,22 +5421,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; - if (wc->stage == UPDATE_BACKREF && - btrfs_header_owner(eb) != root->root_key.objectid) + if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) return 1; /* * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. 
*/ - if (lookup_info && + if (wc->lookup_info && ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { ASSERT(path->locks[level]); ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret) return ret; if (unlikely(wc->refs[level] == 0)) { @@ -5199,11 +5461,20 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_set_disk_extent_flags(trans, eb, flag); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } wc->flags[level] |= flag; } @@ -5226,23 +5497,188 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; struct btrfs_path *path; struct btrfs_extent_inline_ref *iref; int ret; + bool exists = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, - root->root_key.objectid, level, 0); + btrfs_root_id(root), level, 0); + if (ret != -ENOENT) { + /* + * If we get 0 then we found our reference, return 1, else + * return the error if it's not -ENOENT; + */ + btrfs_free_path(path); + return (ret < 0 ) ? ret : 1; + } + + /* + * We could have a delayed ref with this reference, so look it up while + * we're holding the path open to make sure we don't race with the + * delayed ref running. + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (!head) + goto out; + if (!mutex_trylock(&head->mutex)) { + /* + * We're contended, means that the delayed ref is running, get a + * reference and wait for the ref head to be complete and then + * try again. + */ + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto again; + } + + exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); btrfs_free_path(path); - if (ret == -ENOENT) + return exists ? 1 : 0; +} + +/* + * We may not have an uptodate block, so if we are going to walk down into this + * block we need to drop the lock, read it off of the disk, re-lock it and + * return to continue dropping the snapshot. 
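check_ref_exists() above uses the usual contended-head idiom: pin the head with a reference, drop every lock we hold, then lock and immediately unlock the head's mutex purely to wait out whoever is running it, and retry from scratch. A generic pthread sketch of the same shape (names invented):

	#include <pthread.h>
	#include <stdatomic.h>

	struct head {
		atomic_int refs;
		pthread_mutex_t mutex;	/* held by whoever is running this head */
	};

	static void wait_for_head_then_retry(struct head *h)
	{
		atomic_fetch_add(&h->refs, 1);	/* pin h so it cannot be freed */
		/* ...drop our own locks here, as the kernel code releases the
		 * path and the delayed_refs spinlock... */
		pthread_mutex_lock(&h->mutex);	/* blocks until the runner finishes */
		pthread_mutex_unlock(&h->mutex);
		atomic_fetch_sub(&h->refs, 1);	/* the caller now retries from scratch */
	}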
 + */ +static int check_next_block_uptodate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, + struct extent_buffer *next) +{ + struct btrfs_tree_parent_check check = { 0 }; + u64 generation; + int level = wc->level; + int ret; + + btrfs_assert_tree_write_locked(next); + + generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); + + if (btrfs_buffer_uptodate(next, generation, 0)) + return 0; + + check.level = level - 1; + check.transid = generation; + check.owner_root = btrfs_root_id(root); + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); + + btrfs_tree_unlock(next); + if (level == 1) + reada_walk_down(trans, root, wc, path); + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); + return ret; + } + btrfs_tree_lock(next); + wc->lookup_info = 1; + return 0; +} + +/* + * If we determine that we don't have to visit wc->level - 1 then we need to + * determine if we can drop our reference. + * + * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. + * + * If we are DROP_REFERENCE this will figure out if we need to drop our current + * reference, skipping it if we dropped it from a previous incomplete drop, or + * dropping it if we still have a reference to it. + */ +static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, struct walk_control *wc, + struct extent_buffer *next, u64 owner_root) +{ + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = next->start, + .num_bytes = root->fs_info->nodesize, + .owning_root = owner_root, + .ref_root = btrfs_root_id(root), + }; + int level = wc->level; + int ret; + + /* We are UPDATE_BACKREF, we're not dropping anything. */ + if (wc->stage == UPDATE_BACKREF) + return 0; + + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + ref.parent = path->nodes[level]->start; + } else { + ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); + if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { + btrfs_err(root->fs_info, "mismatched block owner"); + return -EIO; + } + } + + /* + * If we had a drop_progress we need to verify the refs are set as + * expected. If we find our ref then we know that from here on out + * everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, next->start, ref.parent, + level - 1); + if (ret <= 0) + return ret; + ret = 0; + wc->restarted = 0; + } + + /* + * Reloc tree doesn't contribute to qgroup numbers, and we have already + * accounted them at merge time (replace_path), thus we could skip + * expensive subtree trace here. + */ + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + wc->refs[level - 1] > 1) { + u64 generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + + ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); + if (ret) { + btrfs_err_rl(root->fs_info, +"error %d accounting shared subtree, quota is out of sync, rescan required", + ret); + } + } + + /* + * We need to update the next key in our walk control so we can update + * the drop_progress key accordingly. We don't care if find_next_key + * doesn't find a key because that means we're at the end and are going + * to clean up now.
 + */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); + + btrfs_init_tree_ref(&ref, level - 1, 0, false); + return btrfs_free_extent(trans, &ref); } /* @@ -5261,20 +5697,15 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc, int *lookup_info) + struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; - u64 parent; - struct btrfs_tree_parent_check check = { 0 }; - struct btrfs_key key; - struct btrfs_ref ref = { 0 }; + u64 owner_root = 0; struct extent_buffer *next; int level = wc->level; - int reada = 0; int ret = 0; - bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -5284,33 +5715,24 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { - *lookup_info = 1; + generation <= btrfs_root_origin_generation(root)) { + wc->lookup_info = 1; return 1; } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - check.level = level - 1; - check.transid = generation; - check.owner_root = root->root_key.objectid; - check.has_first_key = true; - btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, - path->slots[level]); + next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root), + level - 1); + if (IS_ERR(next)) + return PTR_ERR(next); - next = find_extent_buffer(fs_info, bytenr); - if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr, - root->root_key.objectid, level - 1); - if (IS_ERR(next)) - return PTR_ERR(next); - reada = 1; - } btrfs_tree_lock(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], - &wc->flags[level - 1]); + &wc->flags[level - 1], + &owner_root); if (ret < 0) goto out_unlock; @@ -5320,53 +5742,27 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = -EUCLEAN; goto out_unlock; } - *lookup_info = 0; + wc->lookup_info = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->refs[level - 1] > 1) { - need_account = true; - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; - - if (!wc->update_ref || - generation <= root->root_key.offset) - goto skip; - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); - if (ret < 0) - goto skip; + /* If we don't have to walk into this node skip it. */ + if (!visit_node_for_delete(root, wc, path->nodes[level], + wc->refs[level - 1], wc->flags[level - 1], + path->slots[level])) + goto skip; - wc->stage = UPDATE_BACKREF; - wc->shared_level = level - 1; - } - } else { - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; + /* + * We have to walk down into this node, and if we're currently at the + * DROP_REFERENCE stage and this block is shared then we need to switch + * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
+ */ + if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; } - if (!btrfs_buffer_uptodate(next, generation, 0)) { - btrfs_tree_unlock(next); - free_extent_buffer(next); - next = NULL; - *lookup_info = 1; - } - - if (!next) { - if (reada && level == 1) - reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, &check); - if (IS_ERR(next)) { - return PTR_ERR(next); - } else if (!extent_buffer_uptodate(next)) { - free_extent_buffer(next); - return -EIO; - } - btrfs_tree_lock(next); - } + ret = check_next_block_uptodate(trans, root, path, wc, next); + if (ret) + return ret; level--; ASSERT(level == btrfs_header_level(next)); @@ -5383,76 +5779,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, wc->reada_slot = 0; return 0; skip: + ret = maybe_drop_reference(trans, root, path, wc, next, owner_root); + if (ret) + goto out_unlock; wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; - } else { - ASSERT(root->root_key.objectid == - btrfs_header_owner(path->nodes[level])); - if (root->root_key.objectid != - btrfs_header_owner(path->nodes[level])) { - btrfs_err(root->fs_info, - "mismatched block owner"); - ret = -EIO; - goto out_unlock; - } - parent = 0; - } - - /* - * If we had a drop_progress we need to verify the refs are set - * as expected. If we find our ref then we know that from here - * on out everything should be correct, and we can clear the - * ->restarted flag. - */ - if (wc->restarted) { - ret = check_ref_exists(trans, root, bytenr, parent, - level - 1); - if (ret < 0) - goto out_unlock; - if (ret == 0) - goto no_delete; - ret = 0; - wc->restarted = 0; - } - - /* - * Reloc tree doesn't contribute to qgroup numbers, and we have - * already accounted them at merge time (replace_path), - * thus we could skip expensive subtree trace here. - */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - need_account) { - ret = btrfs_qgroup_trace_subtree(trans, next, - generation, level - 1); - if (ret) { - btrfs_err_rl(fs_info, - "Error %d accounting shared subtree. Quota is out of sync, rescan required.", - ret); - } - } - - /* - * We need to update the next key in our walk control so we can - * update the drop_progress key accordingly. We don't care if - * find_next_key doesn't find a key because that means we're at - * the end and are going to clean up now. - */ - wc->drop_level = level; - find_next_key(path, level, &wc->drop_progress); - - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, - 0, false); - ret = btrfs_free_extent(trans, &ref); - if (ret) - goto out_unlock; - } -no_delete: - *lookup_info = 1; + wc->lookup_info = 1; ret = 1; out_unlock: @@ -5486,7 +5818,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, u64 parent = 0; if (wc->stage == UPDATE_BACKREF) { - BUG_ON(wc->shared_level < level); + ASSERT(wc->shared_level >= level); if (level < wc->shared_level) goto out; @@ -5504,14 +5836,15 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, * count is one. 
*/ if (!path->locks[level]) { - BUG_ON(level == 0); + ASSERT(level > 0); btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5532,7 +5865,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } /* wc->stage == DROP_REFERENCE */ - BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + ASSERT(path->locks[level] || wc->refs[level] == 1); if (wc->refs[level] == 1) { if (level == 0) { @@ -5540,8 +5873,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 1); else ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ - if (is_fstree(root->root_key.objectid)) { + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + if (is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, @@ -5561,12 +5897,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (root->root_key.objectid != btrfs_header_owner(eb)) + else if (btrfs_root_id(root) != btrfs_header_owner(eb)) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (root->root_key.objectid != + else if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level + 1])) goto owner_mismatch; } @@ -5582,21 +5918,42 @@ out: owner_mismatch: btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", - btrfs_header_owner(eb), root->root_key.objectid); + btrfs_header_owner(eb), btrfs_root_id(root)); return -EUCLEAN; } +/* + * walk_down_tree consists of two steps. + * + * walk_down_proc(). Look up the reference count and flags of our current + * wc->level. At this point path->nodes[wc->level] should be populated and + * uptodate, and in most cases should already be locked. If we are in + * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and + * we can walk back up the tree. If we are UPDATE_BACKREF we have to set + * FULL_BACKREF on this node if it's not already set, and then do the + * FULL_BACKREF conversion dance, which is to drop the root reference and add + * the shared reference to all of this node's children. + * + * do_walk_down(). This is where we actually start iterating on the children of + * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping + * our reference to the children that return false from visit_node_for_delete(), + * which has various conditions where we know we can just drop our reference + * without visiting the node. For UPDATE_BACKREF we will skip any children that + * visit_node_for_delete() returns false for, only walking down when necessary. + * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of + * snapshot deletion.
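 *
 * Taken together, btrfs_drop_snapshot() below alternates the two:
 * walk_down_tree() descends for as long as walk_down_proc() and
 * do_walk_down() allow, then walk_up_tree() frees or converts blocks and
 * pops back up to advance to the next slot, restarting the transaction
 * between rounds so that deleting a large snapshot never pins a single
 * transaction.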
+ */ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { int level = wc->level; - int lookup_info = 1; int ret = 0; + wc->lookup_info = 1; while (level >= 0) { - ret = walk_down_proc(trans, root, path, wc, lookup_info); + ret = walk_down_proc(trans, root, path, wc); if (ret) break; @@ -5607,7 +5964,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(path->nodes[level])) break; - ret = do_walk_down(trans, root, path, wc, &lookup_info); + ret = do_walk_down(trans, root, path, wc); if (ret > 0) { path->slots[level]++; continue; @@ -5618,6 +5975,23 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, return (ret == 1) ? 0 : ret; } +/* + * walk_up_tree() is responsible for making sure we visit every slot on our + * current node, and if we're at the end of that node then we call + * walk_up_proc() on our current node, which will do one of a few things based on + * our stage. + * + * UPDATE_BACKREF. If wc->level is currently less than our wc->shared_level + * then we need to walk back up the tree, and then go back down into the + * other slots via walk_down_tree to update any other children from our original + * wc->shared_level. Once we're at or above our wc->shared_level we can switch + * back to DROP_REFERENCE, look up the current node's refs and flags, and carry on. + * + * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block. + * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents + * in our current leaf. After that we call btrfs_free_tree_block() on the + * current node and walk up to the next node to walk down the next slot. + */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -5668,8 +6042,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { - const bool is_reloc_root = (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID); + const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5677,24 +6050,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) struct btrfs_root_item *root_item = &root->root_item; struct walk_control *wc; struct btrfs_key key; - int err = 0; - int ret; + const u64 rootid = btrfs_root_id(root); + int ret = 0; int level; bool root_dropped = false; bool unfinished_drop = false; - btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); + btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root)); path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { btrfs_free_path(path); - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -5707,12 +6080,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_free; } - err = btrfs_run_delayed_items(trans); - if (err) + ret = btrfs_run_delayed_items(trans); + if (ret) goto out_end_trans; /* @@ -5743,11 +6116,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) path->lowest_level = level; ret = btrfs_search_slot(NULL,
root, &key, path, 0, 0); path->lowest_level = 0; - if (ret < 0) { - err = ret; + if (ret < 0) goto out_end_trans; - } + WARN_ON(ret > 0); + ret = 0; /* * unlock our path, this is safe because only this @@ -5760,14 +6133,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_tree_lock(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK; + /* + * btrfs_lookup_extent_info() returns 0 for success, + * or < 0 for error. + */ ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], - &wc->flags[level]); - if (ret < 0) { - err = ret; + &wc->flags[level], NULL); + if (ret < 0) goto out_end_trans; - } + BUG_ON(wc->refs[level] == 0); if (level == btrfs_root_drop_level(root_item)) @@ -5793,19 +6169,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; break; } if (ret > 0) { BUG_ON(wc->stage != DROP_REFERENCE); + ret = 0; break; } @@ -5827,7 +6202,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) root_item); if (ret) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } @@ -5838,7 +6212,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, "drop snapshot early exit"); - err = -EAGAIN; + ret = -EAGAIN; goto out_free; } @@ -5852,19 +6226,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_free; } } } btrfs_release_path(path); - if (err) + if (ret) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } @@ -5873,16 +6246,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } else if (ret > 0) { - /* if we fail to delete the orphan item this time + ret = 0; + /* + * If we fail to delete the orphan item this time * around, it'll get picked up the next time. * * The most common failure here is just -ENOENT. */ - btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); + btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root)); } } @@ -5908,11 +6281,19 @@ out_free: kfree(wc); btrfs_free_path(path); out: + if (!ret && root_dropped) { + ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid); + if (ret < 0) + btrfs_warn_rl(fs_info, + "failed to cleanup qgroup 0/%llu: %d", + rootid, ret); + ret = 0; + } /* * We were an unfinished drop root, check to see if there are any * pending, and if not clear and wake up any waiters. 
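The err/ret unification in btrfs_drop_snapshot() converges on the usual kernel single-error-variable shape: one variable carries the first failure to a single cleanup path. A generic sketch of that pattern (step_one()/step_two() are hypothetical helpers, not btrfs functions):

#include <linux/slab.h>

static int step_one(void *buf);	/* hypothetical */
static int step_two(void *buf);	/* hypothetical */

static int do_work(void)
{
	void *buf;
	int ret;

	buf = kzalloc(128, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	ret = step_one(buf);
	if (ret)
		goto out;
	ret = step_two(buf);
out:
	kfree(buf);
	return ret;	/* single error variable, single exit path */
}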
*/ - if (!err && unfinished_drop) + if (!ret && unfinished_drop) btrfs_maybe_wake_unfinished_drop(fs_info); /* @@ -5924,7 +6305,7 @@ out: */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); - return err; + return ret; } /* @@ -5944,9 +6325,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, int level; int parent_level; int ret = 0; - int wret; - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); if (!path) @@ -5980,17 +6360,16 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { - wret = walk_down_tree(trans, root, path, wc); - if (wret < 0) { - ret = wret; + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) break; - } - wret = walk_up_tree(trans, root, path, wc, parent_level); - if (wret < 0) - ret = wret; - if (wret != 0) + ret = walk_up_tree(trans, root, path, wc, parent_level); + if (ret) { + if (ret > 0) + ret = 0; break; + } } kfree(wc); @@ -5998,10 +6377,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end) +/* + * Unpin the extent range in an error context and don't add the space back. + * Errors are not propagated further. + */ +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - return unpin_extent_range(fs_info, start, end, false); + unpin_extent_range(fs_info, start, end, false); } /* diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ef1c1c99294e..2ad51130c037 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,10 +3,21 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include <linux/types.h> #include "misc.h" #include "block-group.h" +#include "locking.h" +struct extent_buffer; struct btrfs_free_cluster; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_path; +struct btrfs_ref; +struct btrfs_disk_key; +struct btrfs_delayed_ref_head; +struct btrfs_delayed_ref_root; +struct btrfs_extent_inline_ref; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, @@ -91,18 +102,19 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes); +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owner_root); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); + const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr, bool strict, @@ -113,6 +125,7 @@ struct extent_buffer 
*btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest); int btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 root_id, @@ -136,12 +149,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2ae50dcca0f..d322cf82783f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,16 +14,13 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/fsverity.h> -#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" #include "ctree.h" #include "btrfs_inode.h" #include "bio.h" -#include "check-integrity.h" #include "locking.h" -#include "rcu-string.h" #include "backref.h" #include "disk-io.h" #include "subpage.h" @@ -79,10 +76,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); pr_err( - "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_buffer_cache, eb); } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); @@ -103,6 +101,13 @@ struct btrfs_bio_ctrl { blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; + + /* + * The sectors of the page which are going to be submitted by + * extent_writepage_io(). + * This is to avoid touching ranges covered by compression/inline. 
+ */ + unsigned long submit_bitmap; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -119,7 +124,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); else - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -148,8 +153,8 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", - sizeof(struct extent_buffer), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_buffer), 0, 0, + NULL); if (!extent_buffer_cache) return -ENOMEM; @@ -166,24 +171,9 @@ void __cold extent_buffer_free_cachep(void) kmem_cache_destroy(extent_buffer_cache); } -void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - clear_page_dirty_for_io(page); - put_page(page); - index++; - } -} - -static void process_one_page(struct btrfs_fs_info *fs_info, - struct page *page, struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_folio(struct btrfs_fs_info *fs_info, + struct folio *folio, const struct folio *locked_folio, + unsigned long page_ops, u64 start, u64 end) { u32 len; @@ -191,23 +181,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info, len = end + 1 - start; if (page_ops & PAGE_SET_ORDERED) - btrfs_page_clamp_set_ordered(fs_info, page, start, len); + btrfs_folio_clamp_set_ordered(fs_info, folio, start, len); if (page_ops & PAGE_START_WRITEBACK) { - btrfs_page_clamp_clear_dirty(fs_info, page, start, len); - btrfs_page_clamp_set_writeback(fs_info, page, start, len); + btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); } if (page_ops & PAGE_END_WRITEBACK) - btrfs_page_clamp_clear_writeback(fs_info, page, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); - if (page != locked_page && (page_ops & PAGE_UNLOCK)) - btrfs_page_end_writer_lock(fs_info, page, start, len); + if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) + btrfs_folio_end_lock(fs_info, folio, start, len); } -static void __process_pages_contig(struct address_space *mapping, - struct page *locked_page, u64 start, u64 end, - unsigned long page_ops) +static void __process_folios_contig(struct address_space *mapping, + const struct folio *locked_folio, u64 start, + u64 end, unsigned long page_ops) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; @@ -223,35 +213,34 @@ static void __process_pages_contig(struct address_space *mapping, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - process_one_page(fs_info, &folio->page, locked_page, - page_ops, start, end); + process_one_folio(fs_info, folio, locked_folio, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); } } -static noinline void __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, +static noinline void 
__unlock_for_delalloc(const struct inode *inode, + const struct folio *locked_folio, u64 start, u64 end) { unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_page); - if (index == locked_page->index && end_index == index) + ASSERT(locked_folio); + if (index == locked_folio->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK); + __process_folios_contig(inode->i_mapping, locked_folio, start, end, + PAGE_UNLOCK); } -static noinline int lock_delalloc_pages(struct inode *inode, - struct page *locked_page, - u64 start, - u64 end) +static noinline int lock_delalloc_folios(struct inode *inode, + const struct folio *locked_folio, + u64 start, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; @@ -259,7 +248,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_page->index && index == end_index) + if (index == locked_folio->index && index == end_index) return 0; folio_batch_init(&fbatch); @@ -272,23 +261,24 @@ static noinline int lock_delalloc_pages(struct inode *inode, goto out; for (i = 0; i < found_folios; i++) { - struct page *page = &fbatch.folios[i]->page; - u32 len = end + 1 - start; + struct folio *folio = fbatch.folios[i]; + u64 range_start; + u32 range_len; - if (page == locked_page) + if (folio == locked_folio) continue; - if (btrfs_page_start_writer_lock(fs_info, page, start, - len)) - goto out; - - if (!PageDirty(page) || page->mapping != mapping) { - btrfs_page_end_writer_lock(fs_info, page, start, - len); + folio_lock(folio); + if (!folio_test_dirty(folio) || folio->mapping != mapping) { + folio_unlock(folio); goto out; } + range_start = max_t(u64, folio_pos(folio), start); + range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + end + 1) - range_start; + btrfs_folio_set_lock(fs_info, folio, range_start, range_len); - processed_end = page_offset(page) + PAGE_SIZE - 1; + processed_end = range_start + range_len - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -298,7 +288,8 @@ static noinline int lock_delalloc_pages(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_page, start, processed_end); + __unlock_for_delalloc(inode, locked_folio, start, + processed_end); return -EAGAIN; } @@ -319,10 +310,10 @@ out: */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, - u64 *end) + struct folio *locked_folio, + u64 *start, u64 *end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; @@ -338,9 +329,9 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, /* Caller should pass a valid @end to indicate the search range end */ ASSERT(orig_end > orig_start); - /* The range should at least cover part of the page */ - ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || - orig_end <= page_offset(locked_page))); + /* The range should at least cover part of the folio */ + ASSERT(!(orig_start >= folio_pos(locked_folio) + 
folio_size(locked_folio) || + orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; @@ -357,25 +348,25 @@ again: } /* - * start comes from the offset of locked_page. We have to lock - * pages in order, so we can't process delalloc bytes before - * locked_page + * start comes from the offset of locked_folio. We have to lock + * folios in order, so we can't process delalloc bytes before + * locked_folio */ if (delalloc_start < *start) delalloc_start = *start; /* - * make sure to limit the number of pages we try to lock down + * make sure to limit the number of folios we try to lock down */ if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the pages after the page that has start */ - ret = lock_delalloc_pages(inode, locked_page, - delalloc_start, delalloc_end); + /* step two, lock all the folios after the folio that has start */ + ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, + delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the pages are gone, lets avoid looping by + /* some of the folios are gone, let's avoid looping by * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); @@ -395,16 +386,15 @@ again: /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1, cached_state); + EXTENT_DELALLOC, cached_state); + + unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - unlock_extent(tree, delalloc_start, delalloc_end, - &cached_state); - __unlock_for_delalloc(inode, locked_page, - delalloc_start, delalloc_end); + __unlock_for_delalloc(inode, locked_folio, delalloc_start, + delalloc_end); cond_resched(); goto again; } - free_extent_state(cached_state); *start = delalloc_start; *end = delalloc_end; out_failed: @@ -412,280 +402,236 @@ out_failed: } void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - struct page *locked_page, + const struct folio *locked_folio, + struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); - __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops); + __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, + end, page_ops); } -static bool btrfs_verify_page(struct page *page, u64 start) +static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) { - if (!fsverity_active(page->mapping->host) || - PageUptodate(page) || - start >= i_size_read(page->mapping->host)) + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + + if (!fsverity_active(folio->mapping->host) || + btrfs_folio_test_uptodate(fs_info, folio, start, len) || + start >= i_size_read(folio->mapping->host)) return true; - return fsverity_verify_page(page); + return fsverity_verify_folio(folio); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(folio_pos(folio) <= start && + start + len <=
folio_pos(folio) + PAGE_SIZE); - if (uptodate && btrfs_verify_page(page, start)) - btrfs_page_set_uptodate(fs_info, page, start, len); + if (uptodate && btrfs_verify_folio(folio, start, len)) + btrfs_folio_set_uptodate(fs_info, folio, start, len); else - btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, page)) - unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) + folio_unlock(folio); else - btrfs_subpage_end_reader(fs_info, page, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } /* - * after a writepage IO is done, we need to: - * clear the uptodate bits on error - * clear the writeback bits in the extent tree for this IO - * end_page_writeback if the page has no more pending IO + * After a write IO is done, we need to: + * + * - clear the uptodate bits on error + * - clear the writeback bits in the extent tree for the range + * - folio_end_writeback() if there is no more pending IO for the folio * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct btrfs_bio *bbio) +static void end_bbio_data_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; - u64 start = page_offset(page) + bvec->bv_offset; - u32 len = bvec->bv_len; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + u64 start = folio_pos(folio) + fi.offset; + u32 len = fi.length; + + /* Only order 0 (single page) folios are allowed for data. */ + ASSERT(folio_order(folio) == 0); /* Our read/write should always be sector aligned. */ - if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + "partial page write in btrfs with offset %zu and length %zu", + fi.offset, fi.length); + else if (!IS_ALIGNED(fi.length, sectorsize)) btrfs_info(fs_info, - "incomplete page write with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); + "incomplete page write with offset %zu and length %zu", + fi.offset, fi.length); - btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); + btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, + !error); if (error) - mapping_set_error(page->mapping, error); - btrfs_page_clear_writeback(fs_info, page, start, len); + mapping_set_error(folio->mapping, error); + btrfs_folio_clear_writeback(fs_info, folio, start, len); } bio_put(bio); } -/* - * Record previously processed extent range - * - * For endio_readpage_release_extent() to handle a full extent range, reducing - * the extent io operations.
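The end_bbio_data_write() conversion swaps the bvec iterator for the block layer's folio iterator. A stripped-down sketch of the same iteration, assuming only the folio_iter API:

#include <linux/bio.h>
#include <linux/pagemap.h>

/* Sketch: walk every folio segment of a completed bio. */
static void sketch_endio(struct bio *bio)
{
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio) {
		/* fi.folio, fi.offset and fi.length describe one segment. */
		u64 start = folio_pos(fi.folio) + fi.offset;
		size_t len = fi.length;

		/* Per-range completion work (writeback clearing etc.) here. */
		(void)start;
		(void)len;
	}
	bio_put(bio);
}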
- */ -struct processed_extent { - struct btrfs_inode *inode; - /* Start of the range in @inode */ - u64 start; - /* End of the range in @inode */ - u64 end; - bool uptodate; -}; - -/* - * Try to release processed extent range - * - * May not release the extent range right now if the current range is - * contiguous to processed extent. - * - * Will release processed extent when any of @inode, @uptodate, the range is - * no longer contiguous to the processed range. - * - * Passing @inode == NULL will force processed extent to be released. - */ -static void endio_readpage_release_extent(struct processed_extent *processed, - struct btrfs_inode *inode, u64 start, u64 end, - bool uptodate) -{ - struct extent_state *cached = NULL; - struct extent_io_tree *tree; - - /* The first extent, initialize @processed */ - if (!processed->inode) - goto update; - - /* - * Contiguous to processed extent, just uptodate the end. - * - * Several things to notice: - * - * - bio can be merged as long as on-disk bytenr is contiguous - * This means we can have page belonging to other inodes, thus need to - * check if the inode still matches. - * - bvec can contain range beyond current page for multi-page bvec - * Thus we need to do processed->end + 1 >= start check - */ - if (processed->inode == inode && processed->uptodate == uptodate && - processed->end + 1 >= start && end >= processed->end) { - processed->end = end; - return; - } - - tree = &processed->inode->io_tree; - /* - * Now we don't have range contiguous to the processed range, release - * the processed range now. - */ - unlock_extent(tree, processed->start, processed->end, &cached); - -update: - /* Update processed to current range */ - processed->inode = inode; - processed->start = start; - processed->end = end; - processed->uptodate = uptodate; -} - -static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) +static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { - ASSERT(PageLocked(page)); - if (!btrfs_is_subpage(fs_info, page)) + ASSERT(folio_test_locked(folio)); + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page)); - btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); + ASSERT(folio_test_private(folio)); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* - * after a readpage IO is done, we need to: - * clear the uptodate bits on error - * set the uptodate bits if things worked - * set the page up to date if all extents in the tree are uptodate - * clear the lock bit in the extent tree - * unlock the page if there are no other extents locked for it + * After a data read IO is done, we need to: + * + * - clear the uptodate bits on error + * - set the uptodate bits if things worked + * - set the folio up to date if all extents in the tree are uptodate + * - clear the lock bit in the extent tree + * - unlock the folio if there are no other extents locked for it * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct btrfs_bio *bbio) +static void end_bbio_data_read(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; - struct bio_vec *bvec; - struct processed_extent processed = { 0 }; - /* - * The offset to the beginning of a bio, since one bio can never be - * larger than UINT_MAX, u32 here is enough. 
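The removed endio_readpage_release_extent() machinery batched contiguous ranges so one unlock covered many bvecs; with per-folio unlocking in end_folio_read() that batching is no longer needed. A generic sketch of the coalescing pattern it implemented, with release_range() as a hypothetical stand-in:

#include <linux/types.h>

struct pending_range {
	u64 start;
	u64 end;
	bool valid;
};

static void release_range(u64 start, u64 end);	/* hypothetical sink */

static void queue_or_release(struct pending_range *p, u64 start, u64 end)
{
	/* Contiguous with the pending range: just extend it. */
	if (p->valid && p->end + 1 >= start && end >= p->end) {
		p->end = end;
		return;
	}
	/* Discontiguous: flush what we have, then start a new range. */
	if (p->valid)
		release_range(p->start, p->end);
	p->start = start;
	p->end = end;
	p->valid = true;
}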
- */ - u32 bio_offset = 0; - struct bvec_iter_all iter_all; + struct folio_iter fi; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; - struct page *page = bvec->bv_page; - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; + struct folio *folio = fi.folio; + struct inode *inode = folio->mapping->host; u64 start; u64 end; u32 len; btrfs_debug(fs_info, - "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - bio->bi_iter.bi_sector, bio->bi_status, + "%s: bi_sector=%llu, err=%d, mirror=%u", + __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); /* * We always issue full-sector reads, but if some block in a - * page fails to read, blk_update_request() will advance + * folio fails to read, blk_update_request() will advance * bv_offset and adjust bv_len to compensate. Print a warning * for unaligned offsets, and an error if they don't add up to * a full sector. */ - if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, - "partial page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, - sectorsize)) + "partial page read in btrfs with offset %zu and length %zu", + fi.offset, fi.length); + else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) btrfs_info(fs_info, - "incomplete page read with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); + "incomplete page read with offset %zu and length %zu", + fi.offset, fi.length); - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - len = bvec->bv_len; + start = folio_pos(folio) + fi.offset; + end = start + fi.length - 1; + len = fi.length; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; /* * Zero out the remaining part if this range straddles * i_size. * - * Here we should only zero the range inside the bvec, + * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (page->index == end_index && i_size <= end) { - u32 zero_start = max(offset_in_page(i_size), - offset_in_page(start)); - - zero_user_segment(page, zero_start, - offset_in_page(end) + 1); + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { + u32 zero_start = max(offset_in_folio(folio, i_size), + offset_in_folio(folio, start)); + u32 zero_len = offset_in_folio(folio, end) + 1 - + zero_start; + + folio_zero_range(folio, zero_start, zero_len); } } /* Update page status and unlock. */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - + end_folio_read(folio, uptodate, start, len); } - /* Release the last extent */ - endio_readpage_release_extent(&processed, NULL, 0, 0, false); bio_put(bio); } /* - * Populate every free slot in a provided array with pages. + * Populate every free slot in a provided array with folios using GFP_NOFS. 
+ * + * @nr_folios: number of folios to allocate + * @folio_array: the array to fill with folios; any existing non-NULL entries in + * the array will be skipped + * + * Return: 0 if all folios were able to be allocated; + * -ENOMEM otherwise, the partially allocated folios would be freed and + * the array slots zeroed + */ +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array) +{ + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + continue; + folio_array[i] = folio_alloc(GFP_NOFS, 0); + if (!folio_array[i]) + goto error; + } + return 0; +error: + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + folio_put(folio_array[i]); + } + return -ENOMEM; +} + +/* + * Populate every free slot in a provided array with pages, using GFP_NOFS. * * @nr_pages: number of pages to allocate * @page_array: the array to fill with pages; any existing non-null entries in - * the array will be skipped + * the array will be skipped + * @nofail: whether to use the __GFP_NOFAIL flag * * Return: 0 if all pages were able to be allocated; * -ENOMEM otherwise, the partially allocated pages would be freed and * the array slots zeroed */ -int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, + bool nofail) { + const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS; unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); if (unlikely(allocated == last)) { /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { @@ -698,13 +644,36 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } +/* + * Populate needed folios for the extent buffer. + * + * For now, the folios populated are always in order 0 (aka, single page). + */ +static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) +{ + struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; + int num_pages = num_extent_pages(eb); + int ret; + + ret = btrfs_alloc_page_array(num_pages, page_array, nofail); + if (ret < 0) + return ret; + + for (int i = 0; i < num_pages; i++) + eb->folios[i] = page_folio(page_array[i]); + eb->folio_size = PAGE_SIZE; + eb->folio_shift = PAGE_SHIFT; + return 0; +} + static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + struct folio *folio, u64 disk_bytenr, unsigned int pg_offset) { struct bio *bio = &bio_ctrl->bbio->bio; struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -717,7 +686,7 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, /* * The contig check requires the following conditions to be met: * - * 1) The pages are belonging to the same inode + * 1) The folios belong to the same inode * This is implied by the call chain. * * 2) The range has adjacent logical bytenr @@ -726,8 +695,8 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, * This is required for the usage of btrfs_bio->file_offset.
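A hypothetical caller of btrfs_alloc_page_array() as declared above; because failure frees the partial allocations and resets every slot, the caller only cleans up on the success path:

#include <linux/mm.h>

static int alloc_example(void)
{
	struct page *pages[16] = { 0 };
	int ret;

	ret = btrfs_alloc_page_array(16, pages, false /* nofail */);
	if (ret)
		return ret;	/* nothing to free, slots were reset */

	/* ... use the pages ... */

	for (int i = 0; i < 16; i++)
		__free_page(pages[i]);
	return 0;
}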
*/ return bio_end_sector(bio) == sector && - page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == - page_offset(page) + pg_offset; + folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == + folio_pos(folio) + pg_offset; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -780,17 +749,17 @@ static void alloc_new_bio(struct btrfs_inode *inode, * The mirror number for this IO should already be initialized in * @bio_ctrl->mirror_num. */ -static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, - u64 disk_bytenr, struct page *page, +static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct folio *folio, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(folio); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) submit_one_bio(bio_ctrl); do { @@ -799,31 +768,32 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, /* Allocate new bio if needed */ if (!bio_ctrl->bbio) { alloc_new_bio(inode, bio_ctrl, disk_bytenr, - page_offset(page) + pg_offset); + folio_pos(folio) + pg_offset); } /* Cap to the current ordered extent boundary if there is one. */ if (len > bio_ctrl->len_to_oe_boundary) { ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); - ASSERT(is_data_inode(&inode->vfs_inode)); + ASSERT(is_data_inode(inode)); len = bio_ctrl->len_to_oe_boundary; } - if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { /* bio full: move on to a new one */ submit_one_bio(bio_ctrl); continue; } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, + len); size -= len; pg_offset += len; disk_bytenr += len; /* - * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * len_to_oe_boundary defaults to U32_MAX, which isn't folio or * sector aligned. alloc_new_bio() then sets it to the end of * our ordered extent for writes into zoned devices. * @@ -833,15 +803,15 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, * boundary is correct. * * When len_to_oe_boundary is U32_MAX, the cap above would - * result in a 4095 byte IO for the last page right before - * we hit the bio limit of UINT_MAX. bio_add_page() has all + * result in a 4095 byte IO for the last folio right before - we hit the bio limit of UINT_MAX. bio_add_folio() has all * the checks required to make sure we don't overflow the bio, * and we should just ignore len_to_oe_boundary completely * unless we're using it to track an ordered extent. * * It's pretty hard to make a bio sized U32_MAX, but it can * happen when the page cache is able to feed us contiguous - * pages for large extents. + * folios for large extents.
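submit_extent_folio()'s do/while reduces to an append-or-flush loop against bio_add_folio(), which returns false when the range does not fit. A sketch of just that core, where struct sketch_ctx and new_bio() are hypothetical and the ordered extent capping is omitted:

#include <linux/bio.h>

struct sketch_ctx {
	struct bio *bio;
};

static struct bio *new_bio(struct sketch_ctx *ctx);	/* hypothetical */

static void queue_range(struct sketch_ctx *ctx, struct folio *folio,
			size_t len, size_t offset)
{
	while (!bio_add_folio(ctx->bio, folio, len, offset)) {
		/* bio full: submit it and retry against a fresh one */
		submit_bio(ctx->bio);
		ctx->bio = new_bio(ctx);
	}
}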
*/ if (bio_ctrl->len_to_oe_boundary != U32_MAX) bio_ctrl->len_to_oe_boundary -= len; @@ -852,9 +822,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, } while (size); } -static int attach_extent_buffer_page(struct extent_buffer *eb, - struct page *page, - struct btrfs_subpage *prealloc) +static int attach_extent_buffer_folio(struct extent_buffer *eb, + struct folio *folio, + struct btrfs_subpage *prealloc) { struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; @@ -865,74 +835,80 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, * For cloned or dummy extent buffers, their pages are not mapped and * will not race with any other ebs. */ - if (page->mapping) - lockdep_assert_held(&page->mapping->private_lock); + if (folio->mapping) + lockdep_assert_held(&folio->mapping->i_private_lock); if (fs_info->nodesize >= PAGE_SIZE) { - if (!PagePrivate(page)) - attach_page_private(page, eb); + if (!folio_test_private(folio)) + folio_attach_private(folio, eb); else - WARN_ON(page->private != (unsigned long)eb); + WARN_ON(folio_get_private(folio) != eb); return 0; } /* Already mapped, just free prealloc */ - if (PagePrivate(page)) { + if (folio_test_private(folio)) { btrfs_free_subpage(prealloc); return 0; } if (prealloc) /* Has preallocated memory for subpage */ - attach_page_private(page, prealloc); + folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ - ret = btrfs_attach_subpage(fs_info, page, - BTRFS_SUBPAGE_METADATA); + ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return ret; } int set_page_extent_mapped(struct page *page) { + return set_folio_extent_mapped(page_folio(page)); +} + +int set_folio_extent_mapped(struct folio *folio) +{ struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); - if (PagePrivate(page)) + if (folio_test_private(folio)) return 0; - fs_info = btrfs_sb(page->mapping->host->i_sb); + fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, page)) - return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + if (btrfs_is_subpage(fs_info, folio->mapping)) + return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); - attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; } -void clear_page_extent_mapped(struct page *page) +void clear_folio_extent_mapped(struct folio *folio) { struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return; - fs_info = btrfs_sb(page->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page)) - return btrfs_detach_subpage(fs_info, page); + fs_info = folio_to_fs_info(folio); + if (btrfs_is_subpage(fs_info, folio->mapping)) + return btrfs_detach_subpage(fs_info, folio); - detach_page_private(page); + folio_detach_private(folio); } -static struct extent_map * -__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, struct extent_map **em_cached) +static struct extent_map *get_extent_map(struct btrfs_inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; - if (em_cached && *em_cached) { + ASSERT(em_cached); + + if (*em_cached) { em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { @@ -944,12 +920,13 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = 
btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR(em)) { + em = btrfs_get_extent(inode, folio, start, len); + if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } + return em; } /* @@ -959,12 +936,12 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 start = page_offset(page); + struct inode *inode = folio->mapping->host; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + u64 start = folio_pos(folio); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; @@ -975,25 +952,23 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, size_t pg_offset = 0; size_t iosize; size_t blocksize = fs_info->sectorsize; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_extent(tree, start, end, NULL); - unlock_page(page); + folio_unlock(folio); return ret; } - if (page->index == last_byte >> PAGE_SHIFT) { - size_t zero_offset = offset_in_page(last_byte); + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { + size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { - iosize = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, iosize); + iosize = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, iosize); } } - bio_ctrl->end_io_func = end_bio_extent_readpage; - begin_page_read(fs_info, page); + bio_ctrl->end_io_func = end_bbio_data_read; + begin_folio_read(fs_info, folio); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; @@ -1001,34 +976,30 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = PAGE_SIZE - pg_offset; - memzero_page(page, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + iosize = folio_size(folio) - pg_offset; + folio_zero_range(folio, pg_offset, iosize); + end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, em_cached); + em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end, NULL); - end_page_read(page, false, cur, end + 1 - cur); + end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - compress_type = em->compress_type; + compress_type = extent_map_compression(em); iosize = min(extent_map_end(em) - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) - disk_bytenr = em->block_start; + disk_bytenr = em->disk_bytenr; else - disk_bytenr = em->block_start + extent_offset; - block_start = em->block_start; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + disk_bytenr = 
extent_map_block_start(em) + extent_offset; + block_start = extent_map_block_start(em); + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; /* @@ -1037,8 +1008,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a - * single bio to populate the pages for the 2 ranges because - * this makes the compressed extent read zero out the pages + * single bio to populate the folios for the 2 ranges because + * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout @@ -1051,13 +1022,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, - * it will decompress extent X into the pages belonging to the + * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining - * pages that belong to the other range that points to extent X. + * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. Both will target * the same physical extent from disk, but we can't currently - * make the compressed bio endio callback populate the pages + * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the @@ -1065,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * is a corner case so we prioritize correctness over * non-optimal behavior (submitting 2 bios for the same extent). 
*/ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && + if (compress_type != BTRFS_COMPRESS_NONE && prev_em_start && *prev_em_start != (u64)-1 && *prev_em_start != em->start) force_bio_submit = true; @@ -1078,18 +1049,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - memzero_page(page, pg_offset, iosize); + folio_zero_range(folio, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } - /* the get_extent function already copied into the page */ + /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -1102,8 +1071,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - pg_offset); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + pg_offset); cur = cur + iosize; pg_offset += iosize; } @@ -1113,16 +1082,20 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + + free_extent_map(em_cached); - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); /* * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. 
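btrfs_read_folio() now always passes an extent map cache, so consecutive lookups that land inside one extent map skip the tree walk; only the final reference has to be dropped. A sketch of a caller-side loop under that contract, where read_one() is a hypothetical per-range reader:

static void read_batch(struct btrfs_inode *inode, u64 start, u64 len, u64 step)
{
	struct extent_map *em_cached = NULL;

	/* Each call reuses em_cached while the offset stays inside the map. */
	for (u64 cur = start; cur < start + len; cur += step)
		read_one(inode, cur, step, &em_cached);	/* hypothetical */

	/* Mirrors btrfs_read_folio(): drop the last cached reference. */
	if (em_cached)
		free_extent_map(em_cached);
}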
@@ -1131,60 +1104,222 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } -static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - u64 *prev_em_start) +static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, + u64 start, u32 len) { - struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); - int index; + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + unsigned int start_bit; + unsigned int nbits; + + ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + nbits = len >> fs_info->sectorsize_bits; + ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); + bitmap_set(delalloc_bitmap, start_bit, nbits); +} - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); +static bool find_next_delalloc_bitmap(struct folio *folio, + unsigned long *delalloc_bitmap, u64 start, + u64 *found_start, u32 *found_len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + const unsigned int bitmap_size = fs_info->sectors_per_page; + unsigned int start_bit; + unsigned int first_zero; + unsigned int first_set; + + ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); + if (first_set >= bitmap_size) + return false; - for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio_ctrl, - prev_em_start); - put_page(pages[index]); - } + *found_start = folio_start + (first_set << fs_info->sectorsize_bits); + first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); + *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; + return true; } /* - * helper for __extent_writepage, doing all of the delayed allocation setup. + * Do all of the delayed allocation setup. * - * This returns 1 if btrfs_run_delalloc_range function did all the work required - * to write the page (copy into inline extent). In this case the IO has - * been started and the page is already unlocked. + * Return >0 if all the dirty blocks are submitted async (compression) or inlined. + * The @folio should no longer be touched (treat it as already unlocked). * - * This returns 0 if all went well (page still locked) - * This returns < 0 if there were errors (page still locked) + * Return 0 if there are still dirty blocks that need to be submitted through + * extent_writepage_io(). + * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be + * submitted, and @folio is still kept locked. + * + * Return <0 if any error is hit. + * Any allocated ordered extent range covering this folio will be marked + * finished (IOERR), and @folio is still kept locked.
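The bit arithmetic in set_delalloc_bitmap() is easiest to see with numbers. A worked example assuming 4K sectors (sectorsize_bits = 12) and a delalloc range starting 8K into the folio with length 12K, which covers sector bits 2..4:

#include <linux/bitmap.h>

static void sector_bitmap_example(unsigned long *bitmap)
{
	const unsigned int sectorsize_bits = 12;	/* 4K sectors, assumed */
	const u64 folio_start = 0;			/* assumed folio position */
	const u64 start = folio_start + (8 << 10);	/* 8K into the folio */
	const u32 len = 12 << 10;			/* 12K long */
	unsigned int start_bit = (start - folio_start) >> sectorsize_bits; /* = 2 */
	unsigned int nbits = len >> sectorsize_bits;			    /* = 3 */

	bitmap_set(bitmap, start_bit, nbits);	/* marks sectors 2, 3 and 4 */
}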
*/ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc) + struct folio *folio, + struct btrfs_bio_ctrl *bio_ctrl) { - const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; + struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); + struct writeback_control *wbc = bio_ctrl->wbc; + const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const u64 page_start = folio_pos(folio); + const u64 page_end = page_start + folio_size(folio) - 1; + unsigned long delalloc_bitmap = 0; + /* + * Save the last found delalloc end. Since the delalloc end can go + * beyond the page boundary, we cannot rely on the subpage bitmap to + * locate the last delalloc end. + */ + u64 last_delalloc_end = 0; + /* + * The range end (exclusive) of the last successfully finished delalloc + * range. + * Any range covered by an ordered extent must either be manually marked + * finished (error handling), or have IO submitted (and finish the + * ordered extent normally). + * + * This records the end of ordered extent cleanup if we hit an error. + */ + u64 last_finished_delalloc_end = page_start; u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; int ret = 0; + int bit; + /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ + if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { + ASSERT(fs_info->sectors_per_page > 1); + btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); + } else { + bio_ctrl->submit_bitmap = 1; + } + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + u64 start = page_start + (bit << fs_info->sectorsize_bits); + + btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); + } + + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; - if (!find_lock_delalloc_range(&inode->vfs_inode, page, + if (!find_lock_delalloc_range(&inode->vfs_inode, folio, &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; } - - ret = btrfs_run_delalloc_range(inode, page, delalloc_start, - delalloc_end, wbc); - if (ret < 0) - return ret; - + set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, + min(delalloc_end, page_end) + 1 - delalloc_start); + last_delalloc_end = delalloc_end; delalloc_start = delalloc_end + 1; } + delalloc_start = page_start; + if (!last_delalloc_end) + goto out; + + /* Run the delalloc ranges for the above locked ranges. */ + while (delalloc_start < page_end) { + u64 found_start; + u32 found_len; + bool found; + + if (!is_subpage) { + /* + * For the non-subpage case, the found delalloc range must + * cover this folio and there must be only one locked + * delalloc range. + */ + found_start = page_start; + found_len = last_delalloc_end + 1 - found_start; + found = true; + } else { + found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, + delalloc_start, &found_start, &found_len); + } + if (!found) + break; + /* + * If the subpage range covers the last sector, the delalloc range + * may end beyond the folio boundary; use the saved delalloc_end + * instead. + */ + if (found_start + found_len >= page_end) + found_len = last_delalloc_end + 1 - found_start; + + if (ret >= 0) { + /* + * Some delalloc ranges may have been created by previous + * folios. Thus we still need to clean up this range during + * error handling.
+			 */
+			last_finished_delalloc_end = found_start;
+			/* No errors hit so far, run the current delalloc range. */
+			ret = btrfs_run_delalloc_range(inode, folio,
+						       found_start,
+						       found_start + found_len - 1,
+						       wbc);
+			if (ret >= 0)
+				last_finished_delalloc_end = found_start + found_len;
+		} else {
+			/*
+			 * We hit an error during a previous delalloc range, so
+			 * we have to clean up the remaining locked ranges.
+			 */
+			unlock_extent(&inode->io_tree, found_start,
+				      found_start + found_len - 1, NULL);
+			__unlock_for_delalloc(&inode->vfs_inode, folio,
+					      found_start,
+					      found_start + found_len - 1);
+		}
+
+		/*
+		 * Some ranges are going to be submitted asynchronously
+		 * (compression or inline). These ranges have their own control
+		 * over when to unlock the pages. We should not touch them
+		 * anymore, so clear the range from the submission bitmap.
+		 */
+		if (ret > 0) {
+			unsigned int start_bit = (found_start - page_start) >>
+						 fs_info->sectorsize_bits;
+			unsigned int end_bit = (min(page_end + 1, found_start + found_len) -
+						page_start) >> fs_info->sectorsize_bits;
+			bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
+		}
+		/*
+		 * The above btrfs_run_delalloc_range() may have unlocked the
+		 * folio, thus for the last range, we cannot touch the folio
+		 * anymore.
+		 */
+		if (found_start + found_len >= last_delalloc_end + 1)
+			break;
+
+		delalloc_start = found_start + found_len;
+	}
+	/*
+	 * It's possible we had some ordered extents created before we hit an
+	 * error, so clean up the non-async, successfully created delalloc
+	 * ranges.
+	 */
+	if (unlikely(ret < 0)) {
+		unsigned int bitmap_size = min(
+				(last_finished_delalloc_end - page_start) >>
+				fs_info->sectorsize_bits,
+				fs_info->sectors_per_page);
+
+		for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
+			btrfs_mark_ordered_io_finished(inode, folio,
+				page_start + (bit << fs_info->sectorsize_bits),
+				fs_info->sectorsize, false);
+		return ret;
+	}
+out:
+	if (last_delalloc_end)
+		delalloc_end = last_delalloc_end;
+	else
+		delalloc_end = page_end;
 	/*
 	 * delalloc_end is already one less than the total length, so
 	 * we don't subtract one from PAGE_SIZE
@@ -1193,10 +1328,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 		DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
 
 	/*
-	 * If btrfs_run_dealloc_range() already started I/O and unlocked
-	 * the pages, we just need to account for them here.
+	 * If all ranges are submitted asynchronously, we just need to account
+	 * for them here.
 	 */
-	if (ret == 1) {
+	if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) {
 		wbc->nr_to_write -= delalloc_to_write;
 		return 1;
 	}
@@ -1214,178 +1349,165 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 }
 
 /*
- * Find the first byte we need to write.
- *
- * For subpage, one page can contain several sectors, and
- * __extent_writepage_io() will just grab all extent maps in the page
- * range and try to submit all non-inline/non-compressed extents.
+ * Return 0 if we have submitted or queued the sector for submission.
+ * Return <0 for critical errors.
  *
- * This is a big problem for subpage, we shouldn't re-submit already written
- * data at all.
- * This function will lookup subpage dirty bit to find which range we really
- * need to submit.
- *
- * Return the next dirty range in [@start, @end).
- * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
+ * Caller should make sure filepos < i_size and handle the filepos >= i_size case.
 */
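To make the bitmap_clear() arithmetic above concrete, here is a small self-contained sketch of how an async-submitted range is dropped from the submission bitmap, assuming a 64K folio with sixteen 4K sectors; clear_range() stands in for the kernel's bitmap_clear() and all values are hypothetical:

#include <assert.h>
#include <stdint.h>

/* Userspace stand-in for the kernel's bitmap_clear(). */
static void clear_range(unsigned long *bitmap, unsigned int start, unsigned int count)
{
	for (unsigned int i = 0; i < count; i++)
		*bitmap &= ~(1UL << (start + i));
}

int main(void)
{
	const unsigned int sectorsize_bits = 12;		/* 4K sectors */
	const uint64_t page_start = 0;
	const uint64_t page_end = page_start + (64 << 10) - 1;	/* 64K folio */
	unsigned long submit_bitmap = 0xffff;			/* all 16 sectors dirty */

	/* Suppose compression claimed a 32K range starting at 16K (ret > 0). */
	uint64_t found_start = page_start + (16 << 10);
	uint32_t found_len = 32 << 10;
	/* Clamp the range end to the folio, as the kernel code does with min(). */
	uint64_t clamped = found_start + found_len < page_end + 1 ?
			   found_start + found_len : page_end + 1;
	unsigned int start_bit = (found_start - page_start) >> sectorsize_bits;
	unsigned int end_bit = (clamped - page_start) >> sectorsize_bits;

	clear_range(&submit_bitmap, start_bit, end_bit - start_bit);
	assert(submit_bitmap == 0xf00f);	/* sectors 4..11 dropped from submission */
	return 0;
}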
-static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
-				 struct page *page, u64 *start, u64 *end)
+static int submit_one_sector(struct btrfs_inode *inode,
+			     struct folio *folio,
+			     u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
+			     loff_t i_size)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	struct btrfs_subpage_info *spi = fs_info->subpage_info;
-	u64 orig_start = *start;
-	/* Declare as unsigned long so we can use bitmap ops */
-	unsigned long flags;
-	int range_start_bit;
-	int range_end_bit;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_map *em;
+	u64 block_start;
+	u64 disk_bytenr;
+	u64 extent_offset;
+	u64 em_end;
+	const u32 sectorsize = fs_info->sectorsize;
 
-	/*
-	 * For regular sector size == page size case, since one page only
-	 * contains one sector, we return the page offset directly.
-	 */
-	if (!btrfs_is_subpage(fs_info, page)) {
-		*start = page_offset(page);
-		*end = page_offset(page) + PAGE_SIZE;
-		return;
-	}
+	ASSERT(IS_ALIGNED(filepos, sectorsize));
+
+	/* @filepos >= i_size case should be handled by the caller. */
+	ASSERT(filepos < i_size);
 
-	range_start_bit = spi->dirty_offset +
-			  (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+	em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
+	if (IS_ERR(em))
+		return PTR_ERR_OR_ZERO(em);
 
-	/* We should have the page locked, but just in case */
-	spin_lock_irqsave(&subpage->lock, flags);
-	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
-			       spi->dirty_offset + spi->bitmap_nr_bits);
-	spin_unlock_irqrestore(&subpage->lock, flags);
+	extent_offset = filepos - em->start;
+	em_end = extent_map_end(em);
+	ASSERT(filepos <= em_end);
+	ASSERT(IS_ALIGNED(em->start, sectorsize));
+	ASSERT(IS_ALIGNED(em->len, sectorsize));
 
-	range_start_bit -= spi->dirty_offset;
-	range_end_bit -= spi->dirty_offset;
+	block_start = extent_map_block_start(em);
+	disk_bytenr = extent_map_block_start(em) + extent_offset;
 
-	*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
-	*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
+	ASSERT(!extent_map_is_compressed(em));
+	ASSERT(block_start != EXTENT_MAP_HOLE);
+	ASSERT(block_start != EXTENT_MAP_INLINE);
+
+	free_extent_map(em);
+	em = NULL;
+
+	/*
+	 * Although the PageDirty bit is cleared before entering this
+	 * function, the subpage dirty bit is not cleared.
+	 * So clear the subpage dirty bit here so that next time we won't
+	 * submit a folio for a range already written to disk.
+	 */
+	btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+	btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
+	/*
+	 * The above call should set the writeback flag for the whole folio,
+	 * even when submitting just a single subpage sector.
+	 * As long as the folio is properly locked and the range is correct,
+	 * we should always get a folio with the writeback flag set.
+	 */
+	ASSERT(folio_test_writeback(folio));
+
+	submit_extent_folio(bio_ctrl, disk_bytenr, folio,
+			    sectorsize, filepos - folio_pos(folio));
+	return 0;
 }
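The core of submit_one_sector() above is the extent-map translation: a file position inside an extent maps to the extent's disk start plus the in-extent offset. A minimal sketch of just that arithmetic, with an illustrative struct em standing in for struct extent_map (field names are assumptions, not the kernel layout):

#include <assert.h>
#include <stdint.h>

struct em {			/* illustrative stand-in for struct extent_map */
	uint64_t start;		/* file offset where the extent begins */
	uint64_t len;		/* length of the extent */
	uint64_t disk_start;	/* where the extent's data starts on disk */
};

/* Disk location of a sector: extent disk start + offset within the extent. */
static uint64_t sector_disk_bytenr(const struct em *em, uint64_t filepos)
{
	assert(filepos >= em->start && filepos < em->start + em->len);
	return em->disk_start + (filepos - em->start);
}

int main(void)
{
	const struct em em = {
		.start = 1 << 20,	/* extent covers file range [1M, 2M) */
		.len = 1 << 20,
		.disk_start = 8 << 20,	/* extent data stored at 8M on disk */
	};

	/* A sector 256K into the extent lands 256K into its disk range. */
	assert(sector_disk_bytenr(&em, (1 << 20) + (256 << 10)) ==
	       (uint64_t)((8 << 20) + (256 << 10)));
	return 0;
}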
 /*
- * helper for __extent_writepage.  This calls the writepage start hooks,
+ * Helper for extent_writepage().  This calls the writepage start hooks,
  * and does the loop to map the page into extents and bios.
  *
  * We return 1 if the IO is started and the page is unlocked,
  * 0 if all went well (page still locked)
  * < 0 if there were errors (page still locked)
  */
-static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
-						    struct page *page,
-						    struct btrfs_bio_ctrl *bio_ctrl,
-						    loff_t i_size,
-						    int *nr_ret)
+static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
+						  struct folio *folio,
+						  u64 start, u32 len,
+						  struct btrfs_bio_ctrl *bio_ctrl,
+						  loff_t i_size)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	u64 cur = page_offset(page);
-	u64 end = cur + PAGE_SIZE - 1;
-	u64 extent_offset;
-	u64 block_start;
-	struct extent_map *em;
+	unsigned long range_bitmap = 0;
+	bool submitted_io = false;
+	bool error = false;
+	const u64 folio_start = folio_pos(folio);
+	u64 cur;
+	int bit;
 	int ret = 0;
-	int nr = 0;
 
-	ret = btrfs_writepage_cow_fixup(page);
+	ASSERT(start >= folio_start &&
+	       start + len <= folio_start + folio_size(folio));
+
+	ret = btrfs_writepage_cow_fixup(folio);
 	if (ret) {
 		/* Fixup worker will requeue */
-		redirty_page_for_writepage(bio_ctrl->wbc, page);
-		unlock_page(page);
+		folio_redirty_for_writepage(bio_ctrl->wbc, folio);
+		folio_unlock(folio);
 		return 1;
 	}
 
-	bio_ctrl->end_io_func = end_bio_extent_writepage;
-	while (cur <= end) {
-		u32 len = end - cur + 1;
-		u64 disk_bytenr;
-		u64 em_end;
-		u64 dirty_range_start = cur;
-		u64 dirty_range_end;
-		u32 iosize;
+	for (cur = start; cur < start + len; cur += fs_info->sectorsize)
+		set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
+	bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
+		   fs_info->sectors_per_page);
+
+	bio_ctrl->end_io_func = end_bbio_data_write;
+
+	for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+		cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
 
 		if (cur >= i_size) {
-			btrfs_mark_ordered_io_finished(inode, page, cur, len,
-						       true);
+			btrfs_mark_ordered_io_finished(inode, folio, cur,
						       start + len - cur, true);
 			/*
 			 * This range is beyond i_size, thus we don't need to
 			 * bother writing back.
 			 * But we still need to clear the dirty subpage bit, or
-			 * the next time the page gets dirtied, we will try to
+			 * the next time the folio gets dirtied, we will try to
 			 * writeback the sectors with subpage dirty bits,
 			 * causing writeback without ordered extent.
 			 */
-			btrfs_page_clear_dirty(fs_info, page, cur, len);
+			btrfs_folio_clear_dirty(fs_info, folio, cur,
+						start + len - cur);
 			break;
 		}
-
-		find_next_dirty_byte(fs_info, page, &dirty_range_start,
-				     &dirty_range_end);
-		if (cur < dirty_range_start) {
-			cur = dirty_range_start;
+		ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
+		if (unlikely(ret < 0)) {
+			/*
+			 * bio_ctrl may contain a bio crossing several folios.
+			 * Submit it immediately so that the bio has a chance
+			 * to finish normally, rather than being marked as an
+			 * error.
+			 */
+			submit_one_bio(bio_ctrl);
+			/*
+			 * Failed to grab the extent map, which should be very
+			 * rare.
+			 * Since there is no bio submitted to finish the ordered
+			 * extent, we have to manually finish this sector.
+			 */
+			btrfs_mark_ordered_io_finished(inode, folio, cur,
+						       fs_info->sectorsize, false);
+			error = true;
 			continue;
 		}
-
-		em = btrfs_get_extent(inode, NULL, 0, cur, len);
-		if (IS_ERR(em)) {
-			ret = PTR_ERR_OR_ZERO(em);
-			goto out_error;
-		}
-
-		extent_offset = cur - em->start;
-		em_end = extent_map_end(em);
-		ASSERT(cur <= em_end);
-		ASSERT(cur < end);
-		ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
-		ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
-
-		block_start = em->block_start;
-		disk_bytenr = em->block_start + extent_offset;
-
-		ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
-		ASSERT(block_start != EXTENT_MAP_HOLE);
-		ASSERT(block_start != EXTENT_MAP_INLINE);
-
-		/*
-		 * Note that em_end from extent_map_end() and dirty_range_end from
-		 * find_next_dirty_byte() are all exclusive
-		 */
-		iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
-		free_extent_map(em);
-		em = NULL;
-
-		btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
-		if (!PageWriteback(page)) {
-			btrfs_err(inode->root->fs_info,
-				  "page %lu not writeback, cur %llu end %llu",
-				  page->index, cur, end);
-		}
-
-		/*
-		 * Although the PageDirty bit is cleared before entering this
-		 * function, subpage dirty bit is not cleared.
-		 * So clear subpage dirty bit here so next time we won't submit
-		 * page for range already written to disk.
-		 */
-		btrfs_page_clear_dirty(fs_info, page, cur, iosize);
-
-		submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
-				   cur - page_offset(page));
-		cur += iosize;
-		nr++;
+		submitted_io = true;
 	}
 
-	btrfs_page_assert_not_dirty(fs_info, page);
-	*nr_ret = nr;
-	return 0;
-
-out_error:
 	/*
-	 * If we finish without problem, we should not only clear page dirty,
-	 * but also empty subpage dirty bits
+	 * If we didn't submit any sector (>= i_size), the folio dirty flag
+	 * gets cleared but PAGECACHE_TAG_DIRTY is not cleared (it is only
+	 * cleared by folio_start_writeback() if the folio is not dirty).
+	 *
+	 * Here we set and then clear writeback for the range. If the full
+	 * folio is no longer dirty, then we clear the PAGECACHE_TAG_DIRTY tag.
+	 *
+	 * If we hit any error, the corresponding sector will still be dirty,
+	 * thus no need to clear PAGECACHE_TAG_DIRTY.
 	 */
-	*nr_ret = nr;
+	if (!submitted_io && !error) {
+		btrfs_folio_set_writeback(fs_info, folio, start, len);
+		btrfs_folio_clear_writeback(fs_info, folio, start, len);
+	}
 	return ret;
 }
 
@@ -1398,60 +1520,60 @@ out_error:
  * Return 0 if everything goes well.
  * Return <0 for error.
 */
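The submission-range clamping in extent_writepage_io() above is plain bitmap arithmetic: build a bitmap of the sectors inside [start, start + len) and AND it with the bitmap that writepage_delalloc() prepared. A userspace model with hypothetical values (4K sectors, a 16-sector folio); the AND step mirrors the kernel's bitmap_and() call:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const unsigned int sectorsize_bits = 12;	/* 4K sectors */
	const uint64_t folio_start = 0;
	unsigned long submit_bitmap = 0x00f3;	/* sectors 0, 1, 4..7 dirty */
	unsigned long range_bitmap = 0;

	/* The caller asks to write 16K starting 8K into the folio. */
	uint64_t start = folio_start + (8 << 10);
	uint32_t len = 16 << 10;

	/* Set one bit per sector inside [start, start + len). */
	for (uint64_t cur = start; cur < start + len; cur += 1UL << sectorsize_bits)
		range_bitmap |= 1UL << ((cur - folio_start) >> sectorsize_bits);

	assert(range_bitmap == 0x003c);		/* sectors 2..5 */

	submit_bitmap &= range_bitmap;		/* the bitmap_and() step */
	assert(submit_bitmap == 0x0030);	/* only dirty sectors 4 and 5 remain */
	return 0;
}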
-static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
+static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
 {
-	struct folio *folio = page_folio(page);
-	struct inode *inode = page->mapping->host;
-	const u64 page_start = page_offset(page);
+	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	int ret;
-	int nr = 0;
 	size_t pg_offset;
-	loff_t i_size = i_size_read(inode);
+	loff_t i_size = i_size_read(&inode->vfs_inode);
 	unsigned long end_index = i_size >> PAGE_SHIFT;
 
-	trace___extent_writepage(page, inode, bio_ctrl->wbc);
+	trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
 
-	WARN_ON(!PageLocked(page));
+	WARN_ON(!folio_test_locked(folio));
 
-	pg_offset = offset_in_page(i_size);
-	if (page->index > end_index ||
-	    (page->index == end_index && !pg_offset)) {
+	pg_offset = offset_in_folio(folio, i_size);
+	if (folio->index > end_index ||
+	    (folio->index == end_index && !pg_offset)) {
 		folio_invalidate(folio, 0, folio_size(folio));
 		folio_unlock(folio);
 		return 0;
 	}
 
-	if (page->index == end_index)
-		memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
+	if (folio->index == end_index)
+		folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
 
-	ret = set_page_extent_mapped(page);
+	/*
+	 * Default to unlocking the whole folio.
+	 * The proper bitmap can only be initialized by writepage_delalloc().
+	 */
+	bio_ctrl->submit_bitmap = (unsigned long)-1;
+	ret = set_folio_extent_mapped(folio);
 	if (ret < 0)
 		goto done;
 
-	ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
+	ret = writepage_delalloc(inode, folio, bio_ctrl);
 	if (ret == 1)
 		return 0;
 	if (ret)
 		goto done;
 
-	ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
+	ret = extent_writepage_io(inode, folio, folio_pos(folio),
+				  PAGE_SIZE, bio_ctrl, i_size);
 	if (ret == 1)
 		return 0;
 
 	bio_ctrl->wbc->nr_to_write--;
 
 done:
-	if (nr == 0) {
-		/* make sure the mapping tag for page dirty gets cleared */
-		set_page_writeback(page);
-		end_page_writeback(page);
-	}
-	if (ret) {
-		btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start,
-					       PAGE_SIZE, !ret);
-		mapping_set_error(page->mapping, ret);
-	}
-	unlock_page(page);
+	if (ret < 0)
+		mapping_set_error(folio->mapping, ret);
+	/*
+	 * Only unlock ranges that are submitted, as there can be some
+	 * async-submitted ranges inside the folio.
+	 */
+	btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
 	ASSERT(ret <= 0);
 	return ret;
 }
@@ -1537,7 +1659,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
 	 * can be no longer dirty nor marked anymore for writeback (if a
 	 * subsequent modification to the extent buffer didn't happen before the
 	 * transaction commit), which makes filemap_fdata[write|wait]_range not
-	 * able to find the pages tagged with SetPageError at transaction
+	 * able to find the pages which contain errors at transaction
 	 * commit time.  So if this happens we must abort the transaction,
 	 * otherwise we commit a super block with btree roots that point to
 	 * btree nodes/leafs whose content on disk is invalid - either garbage
@@ -1583,7 +1705,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
 	 * context.
*/ static struct extent_buffer *find_extent_buffer_nolock( - struct btrfs_fs_info *fs_info, u64 start) + const struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; @@ -1598,24 +1720,23 @@ static struct extent_buffer *find_extent_buffer_nolock( return NULL; } -static void extent_buffer_write_end_io(struct btrfs_bio *bbio) +static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; u32 bio_offset = 0; if (!uptodate) set_btree_ioerr(eb); - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { u64 start = eb->start + bio_offset; - struct page *page = bvec->bv_page; - u32 len = bvec->bv_len; + struct folio *folio = fi.folio; + u32 len = fi.length; - btrfs_page_clear_writeback(fs_info, page, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); bio_offset += len; } @@ -1664,39 +1785,46 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, extent_buffer_write_end_io, eb); + eb->fs_info, end_bbio_meta_write, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; if (fs_info->nodesize < PAGE_SIZE) { - struct page *p = eb->pages[0]; + struct folio *folio = eb->folios[0]; + bool ret; - lock_page(p); - btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, + folio_lock(folio); + btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); + if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len)) { - clear_page_dirty_for_io(p); + folio_clear_dirty_for_io(folio); wbc->nr_to_write--; } - __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); - wbc_account_cgroup_owner(wbc, p, eb->len); - unlock_page(p); + ret = bio_add_folio(&bbio->bio, folio, eb->len, + eb->start - folio_pos(folio)); + ASSERT(ret); + wbc_account_cgroup_owner(wbc, folio, eb->len); + folio_unlock(folio); } else { - for (int i = 0; i < num_extent_pages(eb); i++) { - struct page *p = eb->pages[i]; - - lock_page(p); - clear_page_dirty_for_io(p); - set_page_writeback(p); - __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); - wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); - wbc->nr_to_write--; - unlock_page(p); + int num_folios = num_extent_folios(eb); + + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + bool ret; + + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_start_writeback(folio); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); + ASSERT(ret); + wbc_account_cgroup_owner(wbc, folio, eb->folio_size); + wbc->nr_to_write -= folio_nr_pages(folio); + folio_unlock(folio); } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } /* @@ -1713,17 +1841,17 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. 
*/ -static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) +static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); int submitted = 0; - u64 page_start = page_offset(page); + u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + while (bit_start < fs_info->sectors_per_page) { + struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; u64 start; @@ -1732,21 +1860,21 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&folio->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); - bit_start++; + spin_unlock(&folio->mapping->i_private_lock); + bit_start += sectors_per_node; continue; } - start = page_start + bit_start * fs_info->sectorsize; + start = folio_start + bit_start * fs_info->sectorsize; bit_start += sectors_per_node; /* @@ -1755,7 +1883,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1794,42 +1922,42 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) +static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct extent_buffer *eb; int ret; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return submit_eb_subpage(folio, wbc); - spin_lock(&mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&mapping->i_private_lock); return 0; } - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); /* * Shouldn't happen and normally this would be a BUG_ON but no point * crashing the machine for something we can survive anyway. 
*/ if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } if (eb == ctx->eb) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (!ret) return 0; @@ -1862,7 +1990,7 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct btrfs_eb_write_context ctx = { .wbc = wbc }; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -1903,7 +2031,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &ctx); + ret = submit_eb_page(folio, &ctx); if (ret == 0) continue; if (ret < 0) { @@ -1957,7 +2085,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. * - * We can get ret > 0 from submit_extent_page() indicating how many ebs + * We can get ret > 0 from submit_extent_folio() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. */ if (ret > 0) @@ -2096,7 +2224,7 @@ retry: continue; } - ret = __extent_writepage(&folio->page, bio_ctrl); + ret = extent_writepage(folio, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2143,14 +2271,14 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -void extent_write_locked_range(struct inode *inode, struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { bool found_error = false; int ret = 0; struct address_space *mapping = inode->i_mapping; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); const u32 sectorsize = fs_info->sectorsize; loff_t i_size = i_size_read(inode); u64 cur = start; @@ -2167,42 +2295,50 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); u32 cur_len = cur_end + 1 - cur; - struct page *page; - int nr = 0; + struct folio *folio; + + folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); + + /* + * This shouldn't happen, the pages are pinned and locked, this + * code is just in case, but shouldn't actually be run. + */ + if (IS_ERR(folio)) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + cur, cur_len, false); + mapping_set_error(mapping, PTR_ERR(folio)); + cur = cur_end + 1; + continue; + } - page = find_get_page(mapping, cur >> PAGE_SHIFT); - ASSERT(PageLocked(page)); - if (pages_dirty && page != locked_page) - ASSERT(PageDirty(page)); + ASSERT(folio_test_locked(folio)); + if (pages_dirty && folio != locked_folio) + ASSERT(folio_test_dirty(folio)); - ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, - i_size, &nr); + /* + * Set the submission bitmap to submit all sectors. + * extent_writepage_io() will do the truncation correctly. + */ + bio_ctrl.submit_bitmap = (unsigned long)-1; + ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, + &bio_ctrl, i_size); if (ret == 1) goto next_page; - /* Make sure the mapping tag for page dirty gets cleared. 
*/ - if (nr == 0) { - set_page_writeback(page); - end_page_writeback(page); - } - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, - cur, cur_len, !ret); - mapping_set_error(page->mapping, ret); - } - btrfs_page_unlock_writer(fs_info, page, cur, cur_len); + if (ret) + mapping_set_error(mapping, ret); + btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: - put_page(page); + folio_put(folio); cur = cur_end + 1; } submit_write_bio(&bio_ctrl, found_error ? ret : 0); } -int extent_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; int ret = 0; @@ -2222,21 +2358,23 @@ int extent_writepages(struct address_space *mapping, return ret; } -void extent_readahead(struct readahead_control *rac) +void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; - struct page *pagepool[16]; + struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - int nr; - while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = readahead_pos(rac); - u64 contig_end = contig_start + readahead_batch_length(rac) - 1; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); - contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio_ctrl, &prev_em_start); - } + while ((folio = readahead_folio(rac)) != NULL) + btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + + unlock_extent(&inode->io_tree, start, end, &cached_state); if (em_cached) free_extent_map(em_cached); @@ -2254,7 +2392,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); u64 end = start + folio_size(folio) - 1; - size_t blocksize = btrfs_sb(folio->mapping->host->i_sb)->sectorsize; + size_t blocksize = folio_to_fs_info(folio)->sectorsize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -2280,19 +2418,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ -static int try_release_extent_state(struct extent_io_tree *tree, - struct page *page, gfp_t mask) +static bool try_release_extent_state(struct extent_io_tree *tree, + struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; - int ret = 1; + bool ret; - if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { - ret = 0; + if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { + ret = false; } else { u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED); + int ret2; /* * At this point we can safely clear everything except the @@ -2300,15 +2439,15 @@ static int try_release_extent_state(struct extent_io_tree *tree, * The delalloc new bit will be cleared by ordered extent * completion. 
*/ - ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. */ - if (ret < 0) - ret = 0; + if (ret2 < 0) + ret = false; else - ret = 1; + ret = true; } return ret; } @@ -2318,933 +2457,80 @@ static int try_release_extent_state(struct extent_io_tree *tree, * in the range corresponding to the page, both state records and extent * map records are removed */ -int try_release_extent_mapping(struct page *page, gfp_t mask) +bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { - struct extent_map *em; - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); - struct extent_io_tree *tree = &btrfs_inode->io_tree; - struct extent_map_tree *map = &btrfs_inode->extent_tree; - - if (gfpflags_allow_blocking(mask) && - page->mapping->host->i_size > SZ_16M) { - u64 len; - while (start <= end) { - struct btrfs_fs_info *fs_info; - u64 cur_gen; - - len = end - start + 1; - write_lock(&map->lock); - em = lookup_extent_mapping(map, start, len); - if (!em) { - write_unlock(&map->lock); - break; - } - if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || - em->start != start) { - write_unlock(&map->lock); - free_extent_map(em); - break; - } - if (test_range_bit(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED, 0, NULL)) - goto next; - /* - * If it's not in the list of modified extents, used - * by a fast fsync, we can remove it. If it's being - * logged we can safely remove it since fsync took an - * extra reference on the em. - */ - if (list_empty(&em->list) || - test_bit(EXTENT_FLAG_LOGGING, &em->flags)) - goto remove_em; - /* - * If it's in the list of modified extents, remove it - * only if its generation is older then the current one, - * in which case we don't need it for a fast fsync. - * Otherwise don't remove it, we could be racing with an - * ongoing fast fsync that could miss the new extent. - */ - fs_info = btrfs_inode->root->fs_info; - spin_lock(&fs_info->trans_lock); - cur_gen = fs_info->generation; - spin_unlock(&fs_info->trans_lock); - if (em->generation >= cur_gen) - goto next; -remove_em: - /* - * We only remove extent maps that are not in the list of - * modified extents or that are in the list but with a - * generation lower then the current generation, so there - * is no need to set the full fsync flag on the inode (it - * hurts the fsync performance for workloads with a data - * size that exceeds or is close to the system's memory). - */ - remove_extent_mapping(map, em); - /* once for the rb tree */ - free_extent_map(em); -next: - start = extent_map_end(em); - write_unlock(&map->lock); - - /* once for us */ - free_extent_map(em); - - cond_resched(); /* Allow large-extent preemption. */ - } - } - return try_release_extent_state(tree, page, mask); -} - -struct btrfs_fiemap_entry { - u64 offset; - u64 phys; - u64 len; - u32 flags; -}; - -/* - * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file - * range from the inode's io tree, unlock the subvolume tree search path, flush - * the fiemap cache and relock the file range and research the subvolume tree. 
- * The value here is something negative that can't be confused with a valid - * errno value and different from 1 because that's also a return value from - * fiemap_fill_next_extent() and also it's often used to mean some btree search - * did not find a key, so make it some distinct negative value. - */ -#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1)) - -/* - * Used to: - * - * - Cache the next entry to be emitted to the fiemap buffer, so that we can - * merge extents that are contiguous and can be grouped as a single one; - * - * - Store extents ready to be written to the fiemap buffer in an intermediary - * buffer. This intermediary buffer is to ensure that in case the fiemap - * buffer is memory mapped to the fiemap target file, we don't deadlock - * during btrfs_page_mkwrite(). This is because during fiemap we are locking - * an extent range in order to prevent races with delalloc flushing and - * ordered extent completion, which is needed in order to reliably detect - * delalloc in holes and prealloc extents. And this can lead to a deadlock - * if the fiemap buffer is memory mapped to the file we are running fiemap - * against (a silly, useless in practice scenario, but possible) because - * btrfs_page_mkwrite() will try to lock the same extent range. - */ -struct fiemap_cache { - /* An array of ready fiemap entries. */ - struct btrfs_fiemap_entry *entries; - /* Number of entries in the entries array. */ - int entries_size; - /* Index of the next entry in the entries array to write to. */ - int entries_pos; - /* - * Once the entries array is full, this indicates what's the offset for - * the next file extent item we must search for in the inode's subvolume - * tree after unlocking the extent range in the inode's io tree and - * releasing the search path. - */ - u64 next_search_offset; - /* - * This matches struct fiemap_extent_info::fi_mapped_extents, we use it - * to count ourselves emitted extents and stop instead of relying on - * fiemap_fill_next_extent() because we buffer ready fiemap entries at - * the @entries array, and we want to stop as soon as we hit the max - * amount of extents to map, not just to save time but also to make the - * logic at extent_fiemap() simpler. - */ - unsigned int extents_mapped; - /* Fields for the cached extent (unsubmitted, not ready, extent). */ - u64 offset; - u64 phys; - u64 len; - u32 flags; - bool cached; -}; - -static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo, - struct fiemap_cache *cache) -{ - for (int i = 0; i < cache->entries_pos; i++) { - struct btrfs_fiemap_entry *entry = &cache->entries[i]; - int ret; - - ret = fiemap_fill_next_extent(fieinfo, entry->offset, - entry->phys, entry->len, - entry->flags); - /* - * Ignore 1 (reached max entries) because we keep track of that - * ourselves in emit_fiemap_extent(). - */ - if (ret < 0) - return ret; - } - cache->entries_pos = 0; - - return 0; -} - -/* - * Helper to submit fiemap extent. - * - * Will try to merge current fiemap extent specified by @offset, @phys, - * @len and @flags with cached one. - * And only when we fails to merge, cached one will be submitted as - * fiemap extent. - * - * Return value is the same as fiemap_fill_next_extent(). - */ -static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, - struct fiemap_cache *cache, - u64 offset, u64 phys, u64 len, u32 flags) -{ - struct btrfs_fiemap_entry *entry; - u64 cache_end; - - /* Set at the end of extent_fiemap(). 
*/ - ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); - - if (!cache->cached) - goto assign; - - /* - * When iterating the extents of the inode, at extent_fiemap(), we may - * find an extent that starts at an offset behind the end offset of the - * previous extent we processed. This happens if fiemap is called - * without FIEMAP_FLAG_SYNC and there are ordered extents completing - * after we had to unlock the file range, release the search path, emit - * the fiemap extents stored in the buffer (cache->entries array) and - * the lock the remainder of the range and re-search the btree. - * - * For example we are in leaf X processing its last item, which is the - * file extent item for file range [512K, 1M[, and after - * btrfs_next_leaf() releases the path, there's an ordered extent that - * completes for the file range [768K, 2M[, and that results in trimming - * the file extent item so that it now corresponds to the file range - * [512K, 768K[ and a new file extent item is inserted for the file - * range [768K, 2M[, which may end up as the last item of leaf X or as - * the first item of the next leaf - in either case btrfs_next_leaf() - * will leave us with a path pointing to the new extent item, for the - * file range [768K, 2M[, since that's the first key that follows the - * last one we processed. So in order not to report overlapping extents - * to user space, we trim the length of the previously cached extent and - * emit it. - * - * Upon calling btrfs_next_leaf() we may also find an extent with an - * offset smaller than or equals to cache->offset, and this happens - * when we had a hole or prealloc extent with several delalloc ranges in - * it, but after btrfs_next_leaf() released the path, delalloc was - * flushed and the resulting ordered extents were completed, so we can - * now have found a file extent item for an offset that is smaller than - * or equals to what we have in cache->offset. We deal with this as - * described below. - */ - cache_end = cache->offset + cache->len; - if (cache_end > offset) { - if (offset == cache->offset) { - /* - * We cached a dealloc range (found in the io tree) for - * a hole or prealloc extent and we have now found a - * file extent item for the same offset. What we have - * now is more recent and up to date, so discard what - * we had in the cache and use what we have just found. - */ - goto assign; - } else if (offset > cache->offset) { - /* - * The extent range we previously found ends after the - * offset of the file extent item we found and that - * offset falls somewhere in the middle of that previous - * extent range. So adjust the range we previously found - * to end at the offset of the file extent item we have - * just found, since this extent is more up to date. - * Emit that adjusted range and cache the file extent - * item we have just found. This corresponds to the case - * where a previously found file extent item was split - * due to an ordered extent completing. - */ - cache->len = offset - cache->offset; - goto emit; - } else { - const u64 range_end = offset + len; - - /* - * The offset of the file extent item we have just found - * is behind the cached offset. This means we were - * processing a hole or prealloc extent for which we - * have found delalloc ranges (in the io tree), so what - * we have in the cache is the last delalloc range we - * found while the file extent item we found can be - * either for a whole delalloc range we previously - * emmitted or only a part of that range. 
- * - * We have two cases here: - * - * 1) The file extent item's range ends at or behind the - * cached extent's end. In this case just ignore the - * current file extent item because we don't want to - * overlap with previous ranges that may have been - * emmitted already; - * - * 2) The file extent item starts behind the currently - * cached extent but its end offset goes beyond the - * end offset of the cached extent. We don't want to - * overlap with a previous range that may have been - * emmitted already, so we emit the currently cached - * extent and then partially store the current file - * extent item's range in the cache, for the subrange - * going the cached extent's end to the end of the - * file extent item. - */ - if (range_end <= cache_end) - return 0; - - if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) - phys += cache_end - offset; - - offset = cache_end; - len = range_end - cache_end; - goto emit; + struct btrfs_inode *inode = folio_to_inode(folio); + struct extent_io_tree *io_tree = &inode->io_tree; + + while (start <= end) { + const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); + const u64 len = end - start + 1; + struct extent_map_tree *extent_tree = &inode->extent_tree; + struct extent_map *em; + + write_lock(&extent_tree->lock); + em = lookup_extent_mapping(extent_tree, start, len); + if (!em) { + write_unlock(&extent_tree->lock); + break; } - } - - /* - * Only merges fiemap extents if - * 1) Their logical addresses are continuous - * - * 2) Their physical addresses are continuous - * So truly compressed (physical size smaller than logical size) - * extents won't get merged with each other - * - * 3) Share same flags - */ - if (cache->offset + cache->len == offset && - cache->phys + cache->len == phys && - cache->flags == flags) { - cache->len += len; - return 0; - } - -emit: - /* Not mergeable, need to submit cached one */ - - if (cache->entries_pos == cache->entries_size) { - /* - * We will need to research for the end offset of the last - * stored extent and not from the current offset, because after - * unlocking the range and releasing the path, if there's a hole - * between that end offset and this current offset, a new extent - * may have been inserted due to a new write, so we don't want - * to miss it. - */ - entry = &cache->entries[cache->entries_size - 1]; - cache->next_search_offset = entry->offset + entry->len; - cache->cached = false; - - return BTRFS_FIEMAP_FLUSH_CACHE; - } - - entry = &cache->entries[cache->entries_pos]; - entry->offset = cache->offset; - entry->phys = cache->phys; - entry->len = cache->len; - entry->flags = cache->flags; - cache->entries_pos++; - cache->extents_mapped++; - - if (cache->extents_mapped == fieinfo->fi_extents_max) { - cache->cached = false; - return 1; - } -assign: - cache->cached = true; - cache->offset = offset; - cache->phys = phys; - cache->len = len; - cache->flags = flags; - - return 0; -} - -/* - * Emit last fiemap cache - * - * The last fiemap cache may still be cached in the following case: - * 0 4k 8k - * |<- Fiemap range ->| - * |<------------ First extent ----------->| - * - * In this case, the first extent range will be cached but not emitted. - * So we must emit it before ending extent_fiemap(). 
- */ -static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, - struct fiemap_cache *cache) -{ - int ret; - - if (!cache->cached) - return 0; - - ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, - cache->len, cache->flags); - cache->cached = false; - if (ret > 0) - ret = 0; - return ret; -} - -static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) -{ - struct extent_buffer *clone; - struct btrfs_key key; - int slot; - int ret; - - path->slots[0]++; - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) - return 0; - - ret = btrfs_next_leaf(inode->root, path); - if (ret != 0) - return ret; - - /* - * Don't bother with cloning if there are no more file extent items for - * our inode. - */ - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 1; - - /* See the comment at fiemap_search_slot() about why we clone. */ - clone = btrfs_clone_extent_buffer(path->nodes[0]); - if (!clone) - return -ENOMEM; - - slot = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = clone; - path->slots[0] = slot; - - return 0; -} - -/* - * Search for the first file extent item that starts at a given file offset or - * the one that starts immediately before that offset. - * Returns: 0 on success, < 0 on error, 1 if not found. - */ -static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, - u64 file_offset) -{ - const u64 ino = btrfs_ino(inode); - struct btrfs_root *root = inode->root; - struct extent_buffer *clone; - struct btrfs_key key; - int slot; - int ret; - - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = file_offset; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ret; - - if (ret > 0 && path->slots[0] > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); - if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) - path->slots[0]--; - } - - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret != 0) - return ret; - - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) - return 1; - } - - /* - * We clone the leaf and use it during fiemap. This is because while - * using the leaf we do expensive things like checking if an extent is - * shared, which can take a long time. In order to prevent blocking - * other tasks for too long, we use a clone of the leaf. We have locked - * the file range in the inode's io tree, so we know none of our file - * extent items can change. This way we avoid blocking other tasks that - * want to insert items for other inodes in the same leaf or b+tree - * rebalance operations (triggered for example when someone is trying - * to push items into this leaf when trying to insert an item in a - * neighbour leaf). - * We also need the private clone because holding a read lock on an - * extent buffer of the subvolume's b+tree will make lockdep unhappy - * when we check if extents are shared, as backref walking may need to - * lock the same leaf we are processing. - */ - clone = btrfs_clone_extent_buffer(path->nodes[0]); - if (!clone) - return -ENOMEM; - - slot = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = clone; - path->slots[0] = slot; - - return 0; -} - -/* - * Process a range which is a hole or a prealloc extent in the inode's subvolume - * btree. 
If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc - * extent. The end offset (@end) is inclusive. - */ -static int fiemap_process_hole(struct btrfs_inode *inode, - struct fiemap_extent_info *fieinfo, - struct fiemap_cache *cache, - struct extent_state **delalloc_cached_state, - struct btrfs_backref_share_check_ctx *backref_ctx, - u64 disk_bytenr, u64 extent_offset, - u64 extent_gen, - u64 start, u64 end) -{ - const u64 i_size = i_size_read(&inode->vfs_inode); - u64 cur_offset = start; - u64 last_delalloc_end = 0; - u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; - bool checked_extent_shared = false; - int ret; - - /* - * There can be no delalloc past i_size, so don't waste time looking for - * it beyond i_size. - */ - while (cur_offset < end && cur_offset < i_size) { - u64 delalloc_start; - u64 delalloc_end; - u64 prealloc_start; - u64 prealloc_len = 0; - bool delalloc; - - delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, - delalloc_cached_state, - &delalloc_start, - &delalloc_end); - if (!delalloc) + if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { + write_unlock(&extent_tree->lock); + free_extent_map(em); break; - + } + if (test_range_bit_exists(io_tree, em->start, + extent_map_end(em) - 1, EXTENT_LOCKED)) + goto next; /* - * If this is a prealloc extent we have to report every section - * of it that has no delalloc. + * If it's not in the list of modified extents, used by a fast + * fsync, we can remove it. If it's being logged we can safely + * remove it since fsync took an extra reference on the em. */ - if (disk_bytenr != 0) { - if (last_delalloc_end == 0) { - prealloc_start = start; - prealloc_len = delalloc_start - start; - } else { - prealloc_start = last_delalloc_end + 1; - prealloc_len = delalloc_start - prealloc_start; - } - } - - if (prealloc_len > 0) { - if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, - backref_ctx); - if (ret < 0) - return ret; - else if (ret > 0) - prealloc_flags |= FIEMAP_EXTENT_SHARED; - - checked_extent_shared = true; - } - ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, - disk_bytenr + extent_offset, - prealloc_len, prealloc_flags); - if (ret) - return ret; - extent_offset += prealloc_len; - } - - ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, - delalloc_end + 1 - delalloc_start, - FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - if (ret) - return ret; - - last_delalloc_end = delalloc_end; - cur_offset = delalloc_end + 1; - extent_offset += cur_offset - delalloc_start; - cond_resched(); - } - - /* - * Either we found no delalloc for the whole prealloc extent or we have - * a prealloc extent that spans i_size or starts at or after i_size. 
- */ - if (disk_bytenr != 0 && last_delalloc_end < end) { - u64 prealloc_start; - u64 prealloc_len; - - if (last_delalloc_end == 0) { - prealloc_start = start; - prealloc_len = end + 1 - start; - } else { - prealloc_start = last_delalloc_end + 1; - prealloc_len = end + 1 - prealloc_start; - } - - if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, - backref_ctx); - if (ret < 0) - return ret; - else if (ret > 0) - prealloc_flags |= FIEMAP_EXTENT_SHARED; - } - ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, - disk_bytenr + extent_offset, - prealloc_len, prealloc_flags); - if (ret) - return ret; - } - - return 0; -} - -static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, - struct btrfs_path *path, - u64 *last_extent_end_ret) -{ - const u64 ino = btrfs_ino(inode); - struct btrfs_root *root = inode->root; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *ei; - struct btrfs_key key; - u64 disk_bytenr; - int ret; - - /* - * Lookup the last file extent. We're not using i_size here because - * there might be preallocation past i_size. - */ - ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); - /* There can't be a file extent item at offset (u64)-1 */ - ASSERT(ret != 0); - if (ret < 0) - return ret; - - /* - * For a non-existing key, btrfs_search_slot() always leaves us at a - * slot > 0, except if the btree is empty, which is impossible because - * at least it has the inode item for this inode and all the items for - * the root inode 256. - */ - ASSERT(path->slots[0] > 0); - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { - /* No file extent items in the subvolume tree. */ - *last_extent_end_ret = 0; - return 0; - } - - /* - * For an inline extent, the disk_bytenr is where inline data starts at, - * so first check if we have an inline extent item before checking if we - * have an implicit hole (disk_bytenr == 0). - */ - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { - *last_extent_end_ret = btrfs_file_extent_end(path); - return 0; - } - - /* - * Find the last file extent item that is not a hole (when NO_HOLES is - * not enabled). This should take at most 2 iterations in the worst - * case: we have one hole file extent item at slot 0 of a leaf and - * another hole file extent item as the last item in the previous leaf. - * This is because we merge file extent items that represent holes. - */ - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); - while (disk_bytenr == 0) { - ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); - if (ret < 0) { - return ret; - } else if (ret > 0) { - /* No file extent items that are not holes. 
*/ - *last_extent_end_ret = 0; - return 0; - } - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); - } - - *last_extent_end_ret = btrfs_file_extent_end(path); - return 0; -} - -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - const u64 ino = btrfs_ino(inode); - struct extent_state *cached_state = NULL; - struct extent_state *delalloc_cached_state = NULL; - struct btrfs_path *path; - struct fiemap_cache cache = { 0 }; - struct btrfs_backref_share_check_ctx *backref_ctx; - u64 last_extent_end; - u64 prev_extent_end; - u64 range_start; - u64 range_end; - const u64 sectorsize = inode->root->fs_info->sectorsize; - bool stopped = false; - int ret; - - cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry); - cache.entries = kmalloc_array(cache.entries_size, - sizeof(struct btrfs_fiemap_entry), - GFP_KERNEL); - backref_ctx = btrfs_alloc_backref_share_check_ctx(); - path = btrfs_alloc_path(); - if (!cache.entries || !backref_ctx || !path) { - ret = -ENOMEM; - goto out; - } - -restart: - range_start = round_down(start, sectorsize); - range_end = round_up(start + len, sectorsize); - prev_extent_end = range_start; - - lock_extent(&inode->io_tree, range_start, range_end, &cached_state); - - ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); - if (ret < 0) - goto out_unlock; - btrfs_release_path(path); - - path->reada = READA_FORWARD; - ret = fiemap_search_slot(inode, path, range_start); - if (ret < 0) { - goto out_unlock; - } else if (ret > 0) { + if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) + goto remove_em; /* - * No file extent item found, but we may have delalloc between - * the current offset and i_size. So check for that. + * If it's in the list of modified extents, remove it only if + * its generation is older then the current one, in which case + * we don't need it for a fast fsync. Otherwise don't remove it, + * we could be racing with an ongoing fast fsync that could miss + * the new extent. */ - ret = 0; - goto check_eof_delalloc; - } - - while (prev_extent_end < range_end) { - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_file_extent_item *ei; - struct btrfs_key key; - u64 extent_end; - u64 extent_len; - u64 extent_offset = 0; - u64 extent_gen; - u64 disk_bytenr = 0; - u64 flags = 0; - int extent_type; - u8 compression; - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) - break; - - extent_end = btrfs_file_extent_end(path); - + if (em->generation >= cur_gen) + goto next; +remove_em: /* - * The first iteration can leave us at an extent item that ends - * before our range's start. Move to the next item. + * We only remove extent maps that are not in the list of + * modified extents or that are in the list but with a + * generation lower then the current generation, so there is no + * need to set the full fsync flag on the inode (it hurts the + * fsync performance for workloads with a data size that exceeds + * or is close to the system's memory). */ - if (extent_end <= range_start) - goto next_item; - - backref_ctx->curr_leaf_bytenr = leaf->start; - - /* We have in implicit hole (NO_HOLES feature enabled). */ - if (prev_extent_end < key.offset) { - const u64 hole_end = min(key.offset, range_end) - 1; + remove_extent_mapping(inode, em); + /* Once for the inode's extent map tree. 
*/ + free_extent_map(em); +next: + start = extent_map_end(em); + write_unlock(&extent_tree->lock); - ret = fiemap_process_hole(inode, fieinfo, &cache, - &delalloc_cached_state, - backref_ctx, 0, 0, 0, - prev_extent_end, hole_end); - if (ret < 0) { - goto out_unlock; - } else if (ret > 0) { - /* fiemap_fill_next_extent() told us to stop. */ - stopped = true; - break; - } + /* Once for us, for the lookup_extent_mapping() reference. */ + free_extent_map(em); - /* We've reached the end of the fiemap range, stop. */ - if (key.offset >= range_end) { - stopped = true; + if (need_resched()) { + /* + * If we need to resched but we can't block just exit + * and leave any remaining extent maps. + */ + if (!gfpflags_allow_blocking(mask)) break; - } - } - - extent_len = extent_end - key.offset; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - compression = btrfs_file_extent_compression(leaf, ei); - extent_type = btrfs_file_extent_type(leaf, ei); - extent_gen = btrfs_file_extent_generation(leaf, ei); - - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); - if (compression == BTRFS_COMPRESS_NONE) - extent_offset = btrfs_file_extent_offset(leaf, ei); - } - - if (compression != BTRFS_COMPRESS_NONE) - flags |= FIEMAP_EXTENT_ENCODED; - - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - flags |= FIEMAP_EXTENT_DATA_INLINE; - flags |= FIEMAP_EXTENT_NOT_ALIGNED; - ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, - extent_len, flags); - } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - ret = fiemap_process_hole(inode, fieinfo, &cache, - &delalloc_cached_state, - backref_ctx, - disk_bytenr, extent_offset, - extent_gen, key.offset, - extent_end - 1); - } else if (disk_bytenr == 0) { - /* We have an explicit hole. */ - ret = fiemap_process_hole(inode, fieinfo, &cache, - &delalloc_cached_state, - backref_ctx, 0, 0, 0, - key.offset, extent_end - 1); - } else { - /* We have a regular extent. */ - if (fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, - backref_ctx); - if (ret < 0) - goto out_unlock; - else if (ret > 0) - flags |= FIEMAP_EXTENT_SHARED; - } - - ret = emit_fiemap_extent(fieinfo, &cache, key.offset, - disk_bytenr + extent_offset, - extent_len, flags); - } - - if (ret < 0) { - goto out_unlock; - } else if (ret > 0) { - /* emit_fiemap_extent() told us to stop. */ - stopped = true; - break; - } - - prev_extent_end = extent_end; -next_item: - if (fatal_signal_pending(current)) { - ret = -EINTR; - goto out_unlock; - } - ret = fiemap_next_leaf_item(inode, path); - if (ret < 0) { - goto out_unlock; - } else if (ret > 0) { - /* No more file extent items for this inode. 
*/ - break; + cond_resched(); } - cond_resched(); } - -check_eof_delalloc: - if (!stopped && prev_extent_end < range_end) { - ret = fiemap_process_hole(inode, fieinfo, &cache, - &delalloc_cached_state, backref_ctx, - 0, 0, 0, prev_extent_end, range_end - 1); - if (ret < 0) - goto out_unlock; - prev_extent_end = range_end; - } - - if (cache.cached && cache.offset + cache.len >= last_extent_end) { - const u64 i_size = i_size_read(&inode->vfs_inode); - - if (prev_extent_end < i_size) { - u64 delalloc_start; - u64 delalloc_end; - bool delalloc; - - delalloc = btrfs_find_delalloc_in_range(inode, - prev_extent_end, - i_size - 1, - &delalloc_cached_state, - &delalloc_start, - &delalloc_end); - if (!delalloc) - cache.flags |= FIEMAP_EXTENT_LAST; - } else { - cache.flags |= FIEMAP_EXTENT_LAST; - } - } - -out_unlock: - unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); - - if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { - btrfs_release_path(path); - ret = flush_fiemap_cache(fieinfo, &cache); - if (ret) - goto out; - len -= cache.next_search_offset - start; - start = cache.next_search_offset; - goto restart; - } else if (ret < 0) { - goto out; - } - - /* - * Must free the path before emitting to the fiemap buffer because we - * may have a non-cloned leaf and if the fiemap buffer is memory mapped - * to a file, a write into it (through btrfs_page_mkwrite()) may trigger - * waiting for an ordered extent that in order to complete needs to - * modify that leaf, therefore leading to a deadlock. - */ - btrfs_free_path(path); - path = NULL; - - ret = flush_fiemap_cache(fieinfo, &cache); - if (ret) - goto out; - - ret = emit_last_fiemap_cache(fieinfo, &cache); -out: - free_extent_state(delalloc_cached_state); - kfree(cache.entries); - btrfs_free_backref_share_ctx(backref_ctx); - btrfs_free_path(path); - return ret; + return try_release_extent_state(io_tree, folio, mask); } static void __free_extent_buffer(struct extent_buffer *eb) @@ -3258,41 +2544,35 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) +static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&folio->mapping->i_private_lock); - if (PagePrivate(page)) { - subpage = (struct btrfs_subpage *)page->private; + if (folio_test_private(folio)) { + subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; - /* - * Even there is no eb refs here, we may still have - * end_page_read() call relying on page::private. - */ - if (atomic_read(&subpage->readers)) - return true; } return false; } -static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); /* - * For mapped eb, we're going to change the page private, which should - * be done under the private_lock. + * For mapped eb, we're going to change the folio private, which should + * be done under the i_private_lock. 
*/ if (mapped) - spin_lock(&page->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); - if (!PagePrivate(page)) { + if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return; } @@ -3301,66 +2581,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to + * only clear the folio private if it's still connected to * this eb. */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { + if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - detach_page_private(page); + BUG_ON(folio_test_dirty(folio)); + BUG_ON(folio_test_writeback(folio)); + /* We need to make sure we haven't been attached to a new eb. */ + folio_detach_private(folio); } if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return; } /* - * For subpage, we can have dummy eb with page private. In this case, - * we can directly detach the private as such page is only attached to - * one dummy eb, no sharing. + * For subpage, we can have dummy eb with folio private attached. In + * this case, we can directly detach the private as such folio is only + * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, page); + btrfs_detach_subpage(fs_info, folio); return; } - btrfs_page_dec_eb_refs(fs_info, page); + btrfs_folio_dec_eb_refs(fs_info, folio); /* - * We can only detach the page private if there are no other ebs in the + * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!page_range_has_eb(fs_info, page)) - btrfs_detach_subpage(fs_info, page); + if (!folio_range_has_eb(fs_info, folio)) + btrfs_detach_subpage(fs_info, folio); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); } /* Release all pages attached to the extent buffer */ -static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) { - int i; - int num_pages; - ASSERT(!extent_buffer_under_io(eb)); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; + for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { + struct folio *folio = eb->folios[i]; - if (!page) + if (!folio) continue; - detach_extent_buffer_page(eb, page); + detach_extent_buffer_folio(eb, folio); - /* One for when we allocated the page */ - put_page(page);
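The compare-then-detach step that detach_extent_buffer_folio() performs above can be isolated into the following illustrative helper; eb_detach_folio_if_ours() is hypothetical and not part of the patch. The folio -> eb link may only be torn down under the mapping's i_private_lock, and only if the folio still points at this very eb, since a new eb may have been attached in the meantime:

static void eb_detach_folio_if_ours(struct extent_buffer *eb, struct folio *folio)
{
        spin_lock(&folio->mapping->i_private_lock);
        if (folio_test_private(folio) && folio_get_private(folio) == eb)
                folio_detach_private(folio);    /* pairs with folio_attach_private() */
        spin_unlock(&folio->mapping->i_private_lock);
}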
+ /* One for when we allocated the folio. */ + folio_put(folio); + } } @@ -3398,9 +2670,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { - int i; struct extent_buffer *new; - int num_pages = num_extent_pages(src); + int num_folios = num_extent_folios(src); int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); @@ -3414,22 +2685,21 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - ret = btrfs_alloc_page_array(num_pages, new->pages); + ret = alloc_eb_folio_array(new, false); if (ret) { btrfs_release_extent_buffer(new); return NULL; } - for (i = 0; i < num_pages; i++) { - int ret; - struct page *p = new->pages[i]; + for (int i = 0; i < num_folios; i++) { + struct folio *folio = new->folios[i]; - ret = attach_extent_buffer_page(new, p, NULL); + ret = attach_extent_buffer_folio(new, folio, NULL); if (ret < 0) { btrfs_release_extent_buffer(new); return NULL; } - WARN_ON(PageDirty(p)); + WARN_ON(folio_test_dirty(folio)); } copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); @@ -3441,23 +2711,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { struct extent_buffer *eb; - int num_pages; - int i; + int num_folios = 0; int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; - num_pages = num_extent_pages(eb); - ret = btrfs_alloc_page_array(num_pages, eb->pages); + ret = alloc_eb_folio_array(eb, false); if (ret) goto err; - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - ret = attach_extent_buffer_page(eb, p, NULL); + num_folios = num_extent_folios(eb); + for (int i = 0; i < num_folios; i++) { + ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) goto err; } @@ -3468,10 +2735,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) { - detach_extent_buffer_page(eb, eb->pages[i]); - __free_page(eb->pages[i]); + for (int i = 0; i < num_folios; i++) { + if (eb->folios[i]) { + detach_extent_buffer_folio(eb, eb->folios[i]); + folio_put(eb->folios[i]); } } __free_extent_buffer(eb); @@ -3520,20 +2787,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb, - struct page *accessed) +static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_pages, i; + int num_folios = num_extent_folios(eb); check_buffer_tree_ref(eb); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - if (p != accessed) - mark_page_accessed(p); - } + for (int i = 0; i < num_folios; i++) + folio_mark_accessed(eb->folios[i]); } struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, @@ -3561,14 +2822,14 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, spin_lock(&eb->refs_lock); spin_unlock(&eb->refs_lock); } - mark_extent_buffer_accessed(eb, NULL); + mark_extent_buffer_accessed(eb); return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -3604,14 +2865,20 @@ again: free_eb: btrfs_release_extent_buffer(eb); return exists; -}
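Because btrfs_clone_extent_buffer() above marks the copy EXTENT_BUFFER_UNMAPPED, the clone lives outside the buffer radix tree and the btree page cache. A hedged usage sketch, modeled on how btrfs uses such clones elsewhere (e.g. to work on a stable private copy of a leaf); error handling abbreviated:

        struct extent_buffer *clone;

        /* Private, unmapped copy of the currently locked leaf. */
        clone = btrfs_clone_extent_buffer(path->nodes[0]);
        if (!clone)
                return -ENOMEM;
        btrfs_release_path(path);
        /* ... parse items from the clone without blocking writers ... */
        free_extent_buffer(clone);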
+#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); struct extent_buffer *exists; + lockdep_assert_held(&page->mapping->i_private_lock); + /* * For subpage case, we completely rely on radix tree to ensure we * don't try to insert two ebs for the same bytenr. So here we always @@ -3621,21 +2888,21 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* Page not yet attached to an extent buffer */ - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; /* * We could have already allocated an eb for this page and attached one, * so let's see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can - * just overwrite page->private. + * just overwrite folio private. */ - exists = (struct extent_buffer *)page->private; + exists = folio_get_private(folio); if (atomic_inc_not_zero(&exists->refs)) return exists; WARN_ON(PageDirty(page)); - detach_page_private(page); + folio_detach_private(folio); return NULL; } @@ -3660,6 +2927,101 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) start, fs_info->nodesize); return -EINVAL; } + if (!IS_ALIGNED(start, fs_info->nodesize) && + !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { + btrfs_warn(fs_info, +"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", + start, fs_info->nodesize); + } + return 0; +} + +/* + * Return 0 if eb->folios[i] is attached to btree inode successfully. + * Return >0 if there is already another extent buffer for the range, + * and @found_eb_ret will be updated. + * Return -EAGAIN if the filemap has an existing folio but with a different size + * than @eb. + * The caller needs to free the existing folios and retry using the same order. + */ +static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct btrfs_subpage *prealloc, + struct extent_buffer **found_eb_ret) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + const unsigned long index = eb->start >> PAGE_SHIFT; + struct folio *existing_folio = NULL; + int ret; + + ASSERT(found_eb_ret); + + /* Caller should ensure the folio exists. */ + ASSERT(eb->folios[i]); + +retry: + ret = filemap_add_folio(mapping, eb->folios[i], index + i, + GFP_NOFS | __GFP_NOFAIL); + if (!ret) + goto finish; + + existing_folio = filemap_lock_folio(mapping, index + i); + /* The page cache only exists for a very short time, just retry. */ + if (IS_ERR(existing_folio)) { + existing_folio = NULL; + goto retry; + } + + /* For now, we should only have single-page folios for btree inode. */ + ASSERT(folio_nr_pages(existing_folio) == 1); + + if (folio_size(existing_folio) != eb->folio_size) { + folio_unlock(existing_folio); + folio_put(existing_folio); + return -EAGAIN; + } +
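The retry above covers a narrow race: filemap_add_folio() failed because some folio already occupies the index, but that folio can be freed before filemap_lock_folio() gets hold of it. The generic shape of this add-or-lock-existing loop, sketched as a hypothetical helper (with __GFP_NOFAIL, filemap_add_folio() can only fail with -EEXIST); add_or_lock_existing() is not part of the patch:

static struct folio *add_or_lock_existing(struct address_space *mapping,
                                          struct folio *folio, pgoff_t index)
{
        for (;;) {
                struct folio *existing;

                if (!filemap_add_folio(mapping, folio, index,
                                       GFP_NOFS | __GFP_NOFAIL))
                        return NULL;            /* ours is now in the page cache */
                existing = filemap_lock_folio(mapping, index);
                if (!IS_ERR(existing))
                        return existing;        /* locked whoever beat us to it */
                /* The competing folio vanished in between: add again. */
        }
}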
+finish: + spin_lock(&mapping->i_private_lock); + if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + /* We're going to reuse the existing page, can drop our folio now. */ + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; + } else if (existing_folio) { + struct extent_buffer *existing_eb; + + existing_eb = grab_extent_buffer(fs_info, + folio_page(existing_folio, 0)); + if (existing_eb) { + /* The extent buffer still exists, we can use it directly. */ + *found_eb_ret = existing_eb; + spin_unlock(&mapping->i_private_lock); + folio_unlock(existing_folio); + folio_put(existing_folio); + return 1; + } + /* The extent buffer no longer exists, we can reuse the folio. */ + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; + } + eb->folio_size = folio_size(eb->folios[i]); + eb->folio_shift = folio_shift(eb->folios[i]); + /* Should not fail, as we have preallocated the memory. */ + ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); + ASSERT(!ret); + /* + * To inform that we have an extra eb under allocation, so that + * detach_extent_buffer_folio() won't release the folio private when the + * eb hasn't been inserted into the radix tree yet. + * + * The ref will be decreased when the eb releases the page, in + * detach_extent_buffer_folio(). Thus it needs no special handling in the + * error path. + */ + btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -3667,15 +3029,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; - int num_pages; - int i; - unsigned long index = start >> PAGE_SHIFT; + int num_folios; + int attached = 0; struct extent_buffer *eb; - struct extent_buffer *exists = NULL; - struct page *p; - struct address_space *mapping = fs_info->btree_inode->i_mapping; + struct extent_buffer *existing_eb = NULL; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; + bool page_contig = true; int uptodate = 1; int ret; @@ -3710,11 +3070,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); - num_pages = num_extent_pages(eb); - /* - * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock nor page lock hold. + * Preallocate folio private for subpage case, so that we won't + * allocate memory with i_private_lock nor page lock held. * * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. @@ -3722,47 +3080,73 @@ if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { - exists = ERR_CAST(prealloc); - goto free_eb; + ret = PTR_ERR(prealloc); + goto out; } } - for (i = 0; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); - if (!p) { - exists = ERR_PTR(-ENOMEM); - btrfs_free_subpage(prealloc); - goto free_eb; - } +reallocate: + /* Allocate all pages first. */ + ret = alloc_eb_folio_array(eb, true); + if (ret < 0) { + btrfs_free_subpage(prealloc); + goto out; + } - spin_lock(&mapping->private_lock); - exists = grab_extent_buffer(fs_info, p); - if (exists) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - put_page(p); - mark_extent_buffer_accessed(exists, p); - btrfs_free_subpage(prealloc); - goto free_eb + num_folios = num_extent_folios(eb);
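Given the return contract of attach_eb_folio_to_filemap() documented above, the attach loop of alloc_extent_buffer() that follows boils down to this shape (simplified sketch; the real loop additionally tracks physical contiguity and uptodate state):

        for (int i = 0; i < num_folios; i++) {
                ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
                if (ret > 0)            /* lost the race, switch to existing_eb */
                        goto out;
                if (ret == -EAGAIN)     /* folio size mismatch, redo allocation */
                        goto reallocate;
                attached++;             /* the error path detaches only these */
        }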
+ /* Attach all pages to the filemap. */ + for (int i = 0; i < num_folios; i++) { + struct folio *folio; + + ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); + if (ret > 0) { + ASSERT(existing_eb); + goto out; } - /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_page(eb, p, prealloc); - ASSERT(!ret); + /* - * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the page private - * when the eb hasn't yet been inserted into radix tree. + * TODO: Special handling for a corner case where the order of + * folios mismatches between the new eb and the filemap. + * + * This happens when: * - * The ref will be decreased when the eb released the page, in - * detach_extent_buffer_page(). - * Thus needs no special handling in error path. + * - the new eb is using a higher order folio + * + * - the filemap is still using 0-order folios for the range + * This can happen at the previous eb allocation, when we + * didn't have a higher order folio for the call. + * + * - the existing eb has already been freed + * + * In this case, we have to free the existing folios first, and + * re-allocate using the same order. + * Thankfully this is not going to happen yet, as we're still + * using 0-order folios. + */ + if (unlikely(ret == -EAGAIN)) { + ASSERT(0); + goto reallocate; + } + attached++; + + /* + * Only after attach_eb_folio_to_filemap() is eb->folios[] + * reliable, as we may choose to reuse the existing page cache + * and free the allocated page. + */ + folio = eb->folios[i]; + WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + + /* + * Check if the current page is physically contiguous with the + * previous eb page. + * At this stage, either we allocated a large folio, in which + * case @i can only be 0, or we fell back to per-page allocation. */ - btrfs_page_inc_eb_refs(fs_info, p); - spin_unlock(&mapping->private_lock); + if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) + page_contig = false; - WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); - eb->pages[i] = p; - if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) + if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) uptodate = 0; /* @@ -3775,12 +3159,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + /* All pages are physically contiguous, can skip cross page handling. */ + if (page_contig) + eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } + if (ret) + goto out; spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, @@ -3788,9 +3173,10 @@ again: spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; + ret = 0; + existing_eb = find_extent_buffer(fs_info, start); + if (existing_eb) + goto out; else goto again; } @@ -3803,19 +3189,46 @@ again: * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */
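When page_contig holds, eb->addr gives a linear mapping of the whole buffer, and the access helpers converted later in this diff (read_extent_buffer() and friends) use it as a fast path. A sketch of that shape for illustration only; eb_copy_out() is a hypothetical helper mirroring the real ones:

static void eb_copy_out(const struct extent_buffer *eb, void *dst,
                        unsigned long start, unsigned long len)
{
        if (eb->addr) {
                /* All folios physically contiguous: one linear copy. */
                memcpy(dst, eb->addr + start, len);
                return;
        }
        /* Cross-folio case: walk eb->folios[] piece by piece. */
        for (unsigned long cur = 0; cur < len; ) {
                unsigned long i = get_eb_folio_index(eb, start + cur);
                size_t off = get_eb_offset_in_folio(eb, start + cur);
                size_t chunk = min_t(size_t, len - cur, eb->folio_size - off);

                memcpy((char *)dst + cur, folio_address(eb->folios[i]) + off, chunk);
                cur += chunk;
        }
}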
- for (i = 0; i < num_pages; i++) - unlock_page(eb->pages[i]); + for (int i = 0; i < num_folios; i++) + unlock_page(folio_page(eb->folios[i], 0)); return eb; -free_eb: +out: WARN_ON(!atomic_dec_and_test(&eb->refs)); - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) - unlock_page(eb->pages[i]); + + /* + * Any attached folios need to be detached before we unlock them. This + * is because we insert our new folios into the mapping and then attach + * our eb to each folio. If we fail to insert a folio we'll look up the + * folio for that index, and grab that eb. We do not want that to grab + * this eb, as we're getting ready to free it. So we have to detach it + * first and then unlock it. + * + * We have to drop our reference and NULL it out here because in the + * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. + * Below when we call btrfs_release_extent_buffer() we will call + * detach_extent_buffer_folio() on our remaining pages in the !subpage + * case. If we left eb->folios[i] populated in the subpage case we'd + * double put our reference and be super sad. + */ + for (int i = 0; i < attached; i++) { + ASSERT(eb->folios[i]); + detach_extent_buffer_folio(eb, eb->folios[i]); + unlock_page(folio_page(eb->folios[i], 0)); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; } + /* + * Now all pages of that extent buffer are unmapped, so set the UNMAPPED + * flag, so it can be cleaned up without utilizing page->mapping. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); btrfs_release_extent_buffer(eb); - return exists; + if (ret < 0) + return ERR_PTR(ret); + ASSERT(existing_eb); + return existing_eb; } static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) @@ -3907,31 +3320,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_page_dirty(struct page *page) +static void btree_clear_folio_dirty(struct folio *folio) { - ASSERT(PageDirty(page)); - ASSERT(PageLocked(page)); - clear_page_dirty_for_io(page); - xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) - __xa_clear_mark(&page->mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); - xa_unlock_irq(&page->mapping->i_pages); + ASSERT(folio_test_dirty(folio)); + ASSERT(folio_test_locked(folio)); + folio_clear_dirty_for_io(folio); + xa_lock_irq(&folio->mapping->i_pages); + if (!folio_test_dirty(folio)) + __xa_clear_mark(&folio->mapping->i_pages, + folio_index(folio), PAGECACHE_TAG_DIRTY); + xa_unlock_irq(&folio->mapping->i_pages); } static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page = eb->pages[0]; + struct folio *folio = eb->folios[0]; bool last; - /* btree_clear_page_dirty() needs page locked */ - lock_page(page); - last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, - eb->len); + /* btree_clear_folio_dirty() needs the folio locked.
*/ + folio_lock(folio); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); if (last) - btree_clear_page_dirty(page); - unlock_page(page); + btree_clear_folio_dirty(folio); + folio_unlock(folio); WARN_ON(atomic_read(&eb->refs) == 0); } @@ -3939,15 +3351,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i; - int num_pages; - struct page *page; + int num_folios; btrfs_assert_tree_write_locked(eb); if (trans && btrfs_header_generation(eb) != trans->transid) return; + /* + * Instead of clearing the dirty flag off of the buffer, mark it as + * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve + * write-ordering in zoned mode, without the need to later re-dirty + * the extent_buffer. + * + * The actual zeroout of the buffer will happen later in + * btree_csum_one_bio. + */ + if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); + return; + } + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; @@ -3957,32 +3381,32 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); - num_pages = num_extent_pages(eb); + num_folios = num_extent_folios(eb); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) continue; - lock_page(page); - btree_clear_page_dirty(page); - unlock_page(page); + folio_lock(folio); + btree_clear_folio_dirty(folio); + folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); } void set_extent_buffer_dirty(struct extent_buffer *eb) { - int i; - int num_pages; + int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_pages = num_extent_pages(eb); + num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { bool subpage = eb->fs_info->nodesize < PAGE_SIZE; @@ -3999,34 +3423,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) * the above race. 
*/ if (subpage) - lock_page(eb->pages[0]); - for (i = 0; i < num_pages; i++) - btrfs_page_set_dirty(eb->fs_info, eb->pages[i], - eb->start, eb->len); + lock_page(folio_page(eb->folios[0], 0)); + for (int i = 0; i < num_folios; i++) + btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], + eb->start, eb->len); if (subpage) - unlock_page(eb->pages[0]); + unlock_page(folio_page(eb->folios[0], 0)); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (i = 0; i < num_pages; i++) - ASSERT(PageDirty(eb->pages[i])); + for (int i = 0; i < num_folios; i++) + ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page; - int num_pages; - int i; + int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!page) + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + + if (!folio) continue; /* @@ -4034,46 +3456,56 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) * btrfs_is_subpage() can not handle cloned/dummy metadata. */ if (fs_info->nodesize >= PAGE_SIZE) - ClearPageUptodate(page); + folio_clear_uptodate(folio); else - btrfs_subpage_clear_uptodate(fs_info, page, eb->start, - eb->len); + btrfs_subpage_clear_uptodate(fs_info, folio, + eb->start, eb->len); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page; - int num_pages; - int i; + int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; /* * This is special handling for metadata subpage, as regular * btrfs_is_subpage() can not handle cloned/dummy metadata. */ if (fs_info->nodesize >= PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); else - btrfs_subpage_set_uptodate(fs_info, page, eb->start, - eb->len); + btrfs_subpage_set_uptodate(fs_info, folio, + eb->start, eb->len); } } -static void extent_buffer_read_end_io(struct btrfs_bio *bbio) +static void clear_extent_buffer_reading(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); +} + +static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; u32 bio_offset = 0; + /* + * If the extent buffer is marked UPTODATE before the read operation + * completes, other calls to read_extent_buffer_pages() will return + * early without waiting for the read to finish, causing data races. 
+ */ + WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)); + eb->read_mirror = bbio->mirror_num; if (uptodate && @@ -4087,32 +3519,30 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { + struct folio *folio = fi.folio; u64 start = eb->start + bio_offset; - struct page *page = bvec->bv_page; - u32 len = bvec->bv_len; + u32 len = fi.length; if (uptodate) - btrfs_page_set_uptodate(fs_info, page, start, len); + btrfs_folio_set_uptodate(fs_info, folio, start, len); else - btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_folio_clear_uptodate(fs_info, folio, start, len); bio_offset += len; } - clear_bit(EXTENT_BUFFER_READING, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + clear_extent_buffer_reading(eb); free_extent_buffer(eb); bio_put(&bbio->bio); } int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { - int num_pages = num_extent_pages(eb), i; struct btrfs_bio *bbio; + bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -4136,9 +3566,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, * will now be set, and we shouldn't read it in again. */ if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { - clear_bit(EXTENT_BUFFER_READING, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + clear_extent_buffer_reading(eb); return 0; } @@ -4149,19 +3577,26 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, - extent_buffer_read_end_io, eb); + end_bbio_meta_read, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); if (eb->fs_info->nodesize < PAGE_SIZE) { - __bio_add_page(&bbio->bio, eb->pages[0], eb->len, - eb->start - page_offset(eb->pages[0])); + ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, + eb->start - folio_pos(eb->folios[0])); + ASSERT(ret); } else { - for (i = 0; i < num_pages; i++) - __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); + int num_folios = num_extent_folios(eb); + + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); + ASSERT(ret); + } } - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bbio(bbio, mirror_num); done: if (wait == WAIT_COMPLETE) { @@ -4177,7 +3612,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, unsigned long len) { btrfs_warn(eb->fs_info, - "access to eb bytenr %llu len %lu out of range start %lu len %lu", + "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); @@ -4206,29 +3641,33 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { + const int unit_size = eb->folio_size; size_t cur; size_t offset; - struct page *page; - char *kaddr; char *dst = (char *)dstv; - unsigned long i = get_eb_page_index(start); + unsigned long i = 
get_eb_folio_index(eb, start); if (check_eb_range(eb, start, len)) { /* * Invalid range hit, reset the memory, so callers won't get - * some random garbage for their uninitialzed memory. + * some random garbage for their uninitialized memory. */ memset(dstv, 0, len); return; } - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + memcpy(dstv, eb->addr + start, len); + return; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; + char *kaddr; - cur = min(len, (PAGE_SIZE - offset)); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); memcpy(dst, kaddr + offset, cur); dst += cur; @@ -4242,24 +3681,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { + const int unit_size = eb->folio_size; size_t cur; size_t offset; - struct page *page; - char *kaddr; char __user *dst = (char __user *)dstv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + if (copy_to_user_nofault(dstv, eb->addr + start, len)) + ret = -EFAULT; + return ret; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; + char *kaddr; - cur = min(len, (PAGE_SIZE - offset)); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); if (copy_to_user_nofault(dst, kaddr + offset, cur)) { ret = -EFAULT; break; @@ -4277,25 +3721,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { + const int unit_size = eb->folio_size; size_t cur; size_t offset; - struct page *page; char *kaddr; char *ptr = (char *)ptrv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); int ret = 0; if (check_eb_range(eb, start, len)) return -EINVAL; - offset = get_eb_offset_in_page(eb, start); - - while (len > 0) { - page = eb->pages[i]; + if (eb->addr) + return memcmp(ptrv, eb->addr + start, len); - cur = min(len, (PAGE_SIZE - offset)); + offset = get_eb_offset_in_folio(eb, start); - kaddr = page_address(page); + while (len > 0) { + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); ret = memcmp(ptr, kaddr + offset, cur); if (ret) break; @@ -4314,10 +3758,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, * For regular sector size == PAGE_SIZE case, check if @page is uptodate. * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 
*/ -static void assert_eb_page_uptodate(const struct extent_buffer *eb, - struct page *page) +static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct folio *folio = eb->folios[i]; + + ASSERT(folio); /* * If we are using the commit root we could potentially clear a page @@ -4331,11 +3777,13 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, return; if (fs_info->nodesize < PAGE_SIZE) { - if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + folio = eb->folios[0]; + ASSERT(i == 0); + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, eb->start, eb->len))) - btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); + btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!folio_test_uptodate(folio)); } } @@ -4343,29 +3791,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { + const int unit_size = eb->folio_size; size_t cur; size_t offset; - struct page *page; char *kaddr; - char *src = (char *)srcv; - unsigned long i = get_eb_page_index(start); + const char *src = (const char *)srcv; + unsigned long i = get_eb_folio_index(eb, start); /* For unmapped (dummy) ebs, no need to check their uptodate status. */ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); - if (check_eb_range(eb, start, len)) return; - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + if (use_memmove) + memmove(eb->addr + start, srcv, len); + else + memcpy(eb->addr + start, srcv, len); + return; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; if (check_uptodate) - assert_eb_page_uptodate(eb, page); + assert_eb_folio_uptodate(eb, i); - cur = min(len, PAGE_SIZE - offset); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); if (use_memmove) memmove(kaddr + offset, src, cur); else @@ -4387,16 +3840,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { + const int unit_size = eb->folio_size; unsigned long cur = start; + if (eb->addr) { + memset(eb->addr + start, c, len); + return; + } + while (cur < start + len) { - unsigned long index = get_eb_page_index(cur); - unsigned int offset = get_eb_offset_in_page(eb, cur); - unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); - struct page *page = eb->pages[index]; + unsigned long index = get_eb_folio_index(eb, cur); + unsigned int offset = get_eb_offset_in_folio(eb, cur); + unsigned int cur_len = min(start + len - cur, unit_size - offset); - assert_eb_page_uptodate(eb, page); - memset(page_address(page) + offset, c, cur_len); + assert_eb_folio_uptodate(eb, index); + memset(folio_address(eb->folios[index]) + offset, c, cur_len); cur += cur_len; } @@ -4413,15 +3871,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { + const int unit_size = src->folio_size; unsigned long cur = 0; ASSERT(dst->len == src->len); while (cur < src->len) { - unsigned long index = get_eb_page_index(cur); - unsigned long offset = get_eb_offset_in_page(src, cur); - unsigned long 
cur_len = min(src->len, PAGE_SIZE - offset); - void *addr = page_address(src->pages[index]) + offset; + unsigned long index = get_eb_folio_index(src, cur); + unsigned long offset = get_eb_offset_in_folio(src, cur); + unsigned long cur_len = min(src->len, unit_size - offset); + void *addr = folio_address(src->folios[index]) + offset; write_extent_buffer(dst, addr, cur, cur_len); @@ -4434,12 +3893,12 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + const int unit_size = dst->folio_size; u64 dst_len = dst->len; size_t cur; size_t offset; - struct page *page; char *kaddr; - unsigned long i = get_eb_page_index(dst_offset); + unsigned long i = get_eb_folio_index(dst, dst_offset); if (check_eb_range(dst, dst_offset, len) || check_eb_range(src, src_offset, len)) @@ -4447,15 +3906,14 @@ void copy_extent_buffer(const struct extent_buffer *dst, WARN_ON(src->len != dst_len); - offset = get_eb_offset_in_page(dst, dst_offset); + offset = get_eb_offset_in_folio(dst, dst_offset); while (len > 0) { - page = dst->pages[i]; - assert_eb_page_uptodate(dst, page); + assert_eb_folio_uptodate(dst, i); - cur = min(len, (unsigned long)(PAGE_SIZE - offset)); + cur = min(len, (unsigned long)(unit_size - offset)); - kaddr = page_address(page); + kaddr = folio_address(dst->folios[i]); read_extent_buffer(src, kaddr + offset, src_offset, cur); src_offset += cur; @@ -4466,22 +3924,22 @@ void copy_extent_buffer(const struct extent_buffer *dst, } /* - * eb_bitmap_offset() - calculate the page and offset of the byte containing the - * given bit number - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number - * @page_index: return index of the page in the extent buffer that contains the - * given bit number - * @page_offset: return offset into the page given by page_index + * Calculate the folio and offset of the byte containing the given bit number. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @folio_index: return index of the folio in the extent buffer that contains + * the given bit number + * @folio_offset: return offset into the folio given by folio_index * * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. */ static inline void eb_bitmap_offset(const struct extent_buffer *eb, unsigned long start, unsigned long nr, - unsigned long *page_index, - size_t *page_offset) + unsigned long *folio_index, + size_t *folio_offset) { size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -4491,10 +3949,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. 
*/ - offset = start + offset_in_page(eb->start) + byte_offset; + offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; - *page_index = offset >> PAGE_SHIFT; - *page_offset = offset_in_page(offset); + *folio_index = offset >> eb->folio_shift; + *folio_offset = offset_in_eb_folio(eb, offset); } /* @@ -4507,25 +3965,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) { - u8 *kaddr; - struct page *page; unsigned long i; size_t offset; + u8 *kaddr; eb_bitmap_offset(eb, start, nr, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + assert_eb_folio_uptodate(eb, i); + kaddr = folio_address(eb->folios[i]); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) { - unsigned long index = get_eb_page_index(bytenr); + unsigned long index = get_eb_folio_index(eb, bytenr); if (check_eb_range(eb, bytenr, 1)) return NULL; - return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); + return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); } /* @@ -4610,19 +4066,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + const int unit_size = dst->folio_size; unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; + if (dst->addr) { + const bool use_memmove = areas_overlap(src_offset, dst_offset, len); + + if (use_memmove) + memmove(dst->addr + dst_offset, dst->addr + src_offset, len); + else + memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); + return; + } + while (cur_off < len) { unsigned long cur_src = cur_off + src_offset; - unsigned long pg_index = get_eb_page_index(cur_src); - unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long folio_index = get_eb_folio_index(dst, cur_src); + unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); unsigned long cur_len = min(src_offset + len - cur_src, - PAGE_SIZE - pg_off); - void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + unit_size - folio_off); + void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; const bool use_memmove = areas_overlap(src_offset + cur_off, dst_offset + cur_off, cur_len); @@ -4648,24 +4115,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst, return; } + if (dst->addr) { + memmove(dst->addr + dst_offset, dst->addr + src_offset, len); + return; + } + while (len > 0) { unsigned long src_i; size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; + size_t dst_off_in_folio; + size_t src_off_in_folio; void *src_addr; bool use_memmove; - src_i = get_eb_page_index(src_end); + src_i = get_eb_folio_index(dst, src_end); - dst_off_in_page = get_eb_offset_in_page(dst, dst_end); - src_off_in_page = get_eb_offset_in_page(dst, src_end); + dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); + src_off_in_folio = get_eb_offset_in_folio(dst, src_end); - cur = min_t(unsigned long, len, src_off_in_page + 1); - cur = min(cur, dst_off_in_page + 1); + cur = min_t(unsigned long, len, src_off_in_folio + 1); + cur = min(cur, dst_off_in_folio + 1); - src_addr = page_address(dst->pages[src_i]) + src_off_in_page - - cur + 1; + src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio - + cur + 1; use_memmove = 
areas_overlap(src_end - cur + 1, dst_end - cur + 1, cur); @@ -4680,17 +4152,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, #define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( - struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) + const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) { struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; - u64 page_start = page_offset(page); - u64 cur = page_start; + u64 folio_start = folio_pos(folio); + u64 cur = folio_start; - ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { + while (cur < folio_start + PAGE_SIZE) { int ret; int i; @@ -4702,7 +4174,7 @@ static struct extent_buffer *get_next_extent_buffer( goto out; for (i = 0; i < ret; i++) { /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) + if (gang[i]->start >= folio_start + PAGE_SIZE) goto out; /* Found one */ if (gang[i]->start >= bytenr) { @@ -4716,18 +4188,18 @@ out: return found; } -static int try_release_subpage_extent_buffer(struct page *page) +static int try_release_subpage_extent_buffer(struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - u64 cur = page_offset(page); - const u64 end = page_offset(page) + PAGE_SIZE; + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + u64 cur = folio_pos(folio); + const u64 end = cur + PAGE_SIZE; int ret; while (cur < end) { struct extent_buffer *eb = NULL; /* - * Unlike try_release_extent_buffer() which uses page->private + * Unlike try_release_extent_buffer() which uses folio private * to grab buffer, for subpage case we rely on radix tree, thus * we need to ensure radix tree consistency. * * with spinlock rather than RCU. */ spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, page, cur); + eb = get_next_extent_buffer(fs_info, folio, cur); if (!eb) { /* No more eb in the page range after or at cur */ spin_unlock(&fs_info->buffer_lock); break; } cur = eb->start + eb->len; /* * The same as try_release_extent_buffer(), to ensure the eb * won't disappear out from under us. */ spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); spin_unlock(&fs_info->buffer_lock); break; } spin_unlock(&fs_info->buffer_lock); /* * If tree ref isn't set then we know the ref on this eb is a * real ref, so just return, this eb will likely be freed soon * anyway. */ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { spin_unlock(&eb->refs_lock); break; } /* * Here we don't care about the return value, we will always - * check the page private at the end. And + * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ release_extent_buffer(eb); } /* - * Finally to check if we have cleared page private, as if we have - * released all ebs in the page, the page private should be cleared now. + * Finally check if we have cleared the folio private: if we have + * released all ebs in the page, the folio private should be cleared now. */
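try_release_subpage_extent_buffer() above and try_release_extent_buffer() below are the workhorses behind the btree mapping's folio release path; the address_space callback itself is expected to be a thin wrapper. A sketch of the assumed wiring (the actual callback lives in disk-io.c and is not part of this hunk, so treat the shape below as an assumption):

static bool btree_release_folio_sketch(struct folio *folio, gfp_t gfp_flags)
{
        /* Folios backing a live or in-flight eb must not be released. */
        return try_release_extent_buffer(folio) != 0;
}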
- spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) + spin_lock(&folio->mapping->i_private_lock); + if (!folio_test_private(folio)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return ret; } -int try_release_extent_buffer(struct page *page) +int try_release_extent_buffer(struct folio *folio) { struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page); + return try_release_subpage_extent_buffer(folio); /* - * We need to make sure nobody is changing page->private, as we rely on - * page->private as the pointer to extent buffer. + * We need to make sure nobody is changing folio private, as we rely on + * folio private as the pointer to the extent buffer. */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&folio->mapping->i_private_lock); return 1; } - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); /* @@ -4814,10 +4286,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, @@ -4832,7 +4304,8 @@ } /* - * btrfs_readahead_tree_block - attempt to readahead a child block + * Attempt to readahead a child block. + * * @fs_info: the fs_info * @bytenr: bytenr to read * @owner_root: objectid of the root that owns this eb * @@ -4871,7 +4344,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, } /* - * btrfs_readahead_node_child - readahead a node's child block + * Readahead a node's child block. + * * @node: parent node we're reading from * @slot: slot in the parent node for the child we want to read * diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 68368ba99321..039a73731135 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,11 +7,33 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <linux/rwsem.h> +#include <linux/list.h> +#include <linux/slab.h> #include "compression.h" +#include "messages.h" #include "ulist.h" #include "misc.h" +struct page; +struct file; +struct folio; +struct inode; +struct fiemap_extent_info; +struct readahead_control; +struct address_space; +struct writeback_control; +struct extent_io_tree; +struct extent_map_tree; +struct extent_state; +struct btrfs_block_group; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, @@ -28,7 +50,8 @@ enum { EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, - EXTENT_BUFFER_NO_CHECK, + /* Indicate the extent buffer is written zeroed out (for zoned) */ + EXTENT_BUFFER_ZONED_ZEROOUT, /* Indicate that extent buffer pages are being read */ EXTENT_BUFFER_READING, }; @@ -43,10 +66,10 @@ enum { }; /* - * page->private values.
Every page that is controlled by the extent - * map has page->private set to one. + * Folio private values. Every page that is controlled by the extent map has + * folio private set to this value. */ -#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_FOLIO_PRIVATE 1 /* * The extent buffer bitmap operations are done with byte granularity instead of @@ -56,17 +79,12 @@ enum { * single word in a bitmap may straddle two pages in the extent buffer. */ #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1) #define BITMAP_FIRST_BYTE_MASK(start) \ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_root; -struct btrfs_inode; -struct btrfs_fs_info; -struct extent_io_tree; -struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); @@ -74,22 +92,36 @@ void __cold extent_buffer_free_cachep(void); #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; - unsigned long len; + u32 len; + u32 folio_size; unsigned long bflags; struct btrfs_fs_info *fs_info; + + /* + * The address where the eb can be accessed without any cross-page handling. + * This can be NULL if not possible. + */ + void *addr; + spinlock_t refs_lock; atomic_t refs; int read_mirror; - struct rcu_head rcu_head; - pid_t lock_owner; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + u8 folio_shift; + struct rcu_head rcu_head; struct rw_semaphore lock; - struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; + /* + * Pointers to all the folios of the extent buffer. + * + * For now the folio is always order 0 (aka, a single page). + */ + struct folio *folios[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; + pid_t lock_owner; #endif }; @@ -100,6 +132,13 @@ struct btrfs_eb_write_context { struct btrfs_block_group *zoned_bg; }; +static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb, + u64 start) +{ + ASSERT(eb->folio_size); + return start & (eb->folio_size - 1); +} + /* * Get the correct offset inside the page of extent buffer. * @@ -108,29 +147,43 @@ struct btrfs_eb_write_context { * * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) +static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb, + unsigned long offset) { /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. + * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case + * 1.1) One large folio covering the whole eb + * The eb->start is aligned to folio size, thus adding it + * won't cause any difference. + * 1.2) Several page sized folios + * The eb->start is aligned to folio (page) size, thus + * adding it won't cause any difference. * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. + * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case + * In this case there would only be one page sized folio, and there + * may be several different extent buffers in the page/folio. + * We need to add eb->start to properly access the offset inside + * that eb. 
*/ - return offset_in_page(offset + eb->start); + return offset_in_folio(eb->folios[0], offset + eb->start); } -static inline unsigned long get_eb_page_index(unsigned long offset) +static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, + unsigned long offset) { /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case + * 1.1) One large folio covering the whole eb. + * the folio_shift would be large enough to always make us + * return 0 as index. + * 1.2) Several page sized folios + * The folio_shift would be PAGE_SHIFT, giving us the correct + * index. * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. + * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case + * The folio would only be page sized, and always give us 0 as index. */ - return offset >> PAGE_SHIFT; + return offset >> eb->folio_shift; } /* @@ -162,6 +215,11 @@ static inline struct extent_changeset *extent_changeset_alloc(void) return ret; } +static inline void extent_changeset_prealloc(struct extent_changeset *changeset, gfp_t gfp_mask) +{ + ulist_prealloc(&changeset->range_changed, gfp_mask); +} + static inline void extent_changeset_release(struct extent_changeset *changeset) { if (!changeset) @@ -178,24 +236,20 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -struct extent_map_tree; - -int try_release_extent_mapping(struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page); +bool try_release_extent_mapping(struct folio *folio, gfp_t mask); +int try_release_extent_buffer(struct folio *folio); int btrfs_read_folio(struct file *file, struct folio *folio); -void extent_write_locked_range(struct inode *inode, struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); -int extent_writepages(struct address_space *mapping, - struct writeback_control *wbc); +int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -void extent_readahead(struct readahead_control *rac); -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len); +void btrfs_readahead(struct readahead_control *rac); +int set_folio_extent_mapped(struct folio *folio); int set_page_extent_mapped(struct page *page); -void clear_page_extent_mapped(struct page *page); +void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); @@ -212,7 +266,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - struct btrfs_tree_parent_check *parent_check); + const struct btrfs_tree_parent_check *parent_check); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); @@ -230,6 +284,22 @@ static inline int num_extent_pages(const struct extent_buffer *eb) return (eb->len >> PAGE_SHIFT) ?: 1; } +/* + * This can only be determined at runtime by checking 
eb::folios[0]. + * + * As we can have either one large folio covering the whole eb + * (either nodesize <= PAGE_SIZE, or high order folio), or multiple + * single-paged folios. + */ +static inline int num_extent_folios(const struct extent_buffer *eb) +{ + if (!eb->folios[0]) + return 0; + if (folio_order(eb->folios[0])) + return 1; + return num_extent_pages(eb); +} + static inline int extent_buffer_uptodate(const struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -285,20 +355,22 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); -void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - struct page *locked_page, + const struct folio *locked_folio, + struct extent_state **cached, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset); void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, + bool nofail); +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, + struct folio *locked_folio, u64 *start, u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8c017c4105f2..36af9aa9aab1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -5,10 +5,10 @@ #include <linux/spinlock.h> #include "messages.h" #include "ctree.h" -#include "volumes.h" #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" +#include "disk-io.h" static struct kmem_cache *extent_map_cache; @@ -16,8 +16,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", - sizeof(struct extent_map), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_map), 0, 0, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -34,7 +33,7 @@ void __cold extent_map_exit(void) */ void extent_map_tree_init(struct extent_map_tree *tree) { - tree->map = RB_ROOT_CACHED; + tree->root = RB_ROOT; INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } @@ -50,7 +49,6 @@ struct extent_map *alloc_extent_map(void) if (!em) return NULL; RB_CLEAR_NODE(&em->rb_node); - em->compress_type = BTRFS_COMPRESS_NONE; refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; @@ -67,8 +65,6 @@ void free_extent_map(struct extent_map *em) if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } @@ -81,27 +77,32 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static int tree_insert(struct rb_root_cached *root, struct extent_map *em) +static void dec_evictable_extent_maps(struct btrfs_inode *inode) { - struct rb_node **p = &root->rb_root.rb_node; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + 
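+ /* + * Mirrors the increment done in add_extent_mapping(): only extent maps + * of inodes in subvolume (fs) trees are accounted as evictable, and + * test filesystems (btrfs_is_testing()) are skipped. + */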
if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) + percpu_counter_dec(&fs_info->evictable_extent_maps); +} + +static int tree_insert(struct rb_root *root, struct extent_map *em) +{ + struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct extent_map *entry = NULL; struct rb_node *orig_parent = NULL; u64 end = range_end(em->start, em->len); - bool leftmost = true; while (*p) { parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - if (em->start < entry->start) { + if (em->start < entry->start) p = &(*p)->rb_left; - } else if (em->start >= extent_map_end(entry)) { + else if (em->start >= extent_map_end(entry)) p = &(*p)->rb_right; - leftmost = false; - } else { + else return -EEXIST; - } } orig_parent = parent; @@ -124,7 +125,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); - rb_insert_color_cached(&em->rb_node, root, leftmost); + rb_insert_color(&em->rb_node, root); return 0; } @@ -182,54 +183,163 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* Check to see if two extent_map structs are adjacent and safe to merge. */ -static int mergable_maps(struct extent_map *prev, struct extent_map *next) +static inline u64 extent_map_block_len(const struct extent_map *em) { - if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) - return 0; + if (extent_map_is_compressed(em)) + return em->disk_num_bytes; + return em->len; +} - /* - * don't merge compressed extents, we need to know their - * actual size - */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) - return 0; +static inline u64 extent_map_block_end(const struct extent_map *em) +{ + const u64 block_start = extent_map_block_start(em); + const u64 block_end = block_start + extent_map_block_len(em); - if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || - test_bit(EXTENT_FLAG_LOGGING, &next->flags)) - return 0; + if (block_end < block_start) + return (u64)-1; + + return block_end; +} + +static bool can_merge_extent_map(const struct extent_map *em) +{ + if (em->flags & EXTENT_FLAG_PINNED) + return false; + + /* Don't merge compressed extents, we need to know their actual size. */ + if (extent_map_is_compressed(em)) + return false; + + if (em->flags & EXTENT_FLAG_LOGGING) + return false; /* * We don't want to merge stuff that hasn't been written to the log yet * since it may not reflect exactly what is on disk, and that would be * bad. */ - if (!list_empty(&prev->list) || !list_empty(&next->list)) - return 0; + if (!list_empty(&em->list)) + return false; + + return true; +} + +/* Check to see if two extent_map structs are adjacent and safe to merge. */ +static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) +{ + if (extent_map_end(prev) != next->start) + return false; + + /* + * The merged flag is not an on-disk flag, it just indicates we had the + * extent maps of 2 (or more) adjacent extents merged, so factor it out. + */ + if ((prev->flags & ~EXTENT_FLAG_MERGED) != + (next->flags & ~EXTENT_FLAG_MERGED)) + return false; + + if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) + return extent_map_block_start(next) == extent_map_block_end(prev); + + /* HOLES and INLINE extents. */ + return next->disk_bytenr == prev->disk_bytenr; +} + +/* + * Handle the on-disk data extents merge for @prev and @next. 
+ * + * @prev: left extent to merge + * @next: right extent to merge + * @merged: the extent we will not discard after the merge; updated with new values + * + * After this, one of the two extents is the new merged extent and the other is + * removed from the tree and likely freed. Note that @merged is one of @prev/@next + * so there is const/non-const aliasing occurring here. + * + * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes. + * For now only uncompressed regular extent can be merged. + */ +static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next, + struct extent_map *merged) +{ + u64 new_disk_bytenr; + u64 new_disk_num_bytes; + u64 new_offset; + + /* @prev and @next should not be compressed. */ + ASSERT(!extent_map_is_compressed(prev)); + ASSERT(!extent_map_is_compressed(next)); + + /* + * There are two different cases where @prev and @next can be merged. + * + * 1) They are referring to the same data extent: + * + * |<----- data extent A ----->| + * |<- prev ->|<- next ->| + * + * 2) They are referring to different data extents but still adjacent: + * + * |<-- data extent A -->|<-- data extent B -->| + * |<- prev ->|<- next ->| + * + * The calculation here always merges the data extents first, then updates + * @offset using the new data extents. + * + * For case 1), the merged data extent would be the same. + * For case 2), we just merge the two data extents into one. + */ + new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr); + new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes, + next->disk_bytenr + next->disk_num_bytes) - + new_disk_bytenr; + new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr; + + merged->disk_bytenr = new_disk_bytenr; + merged->disk_num_bytes = new_disk_num_bytes; + merged->ram_bytes = new_disk_num_bytes; + merged->offset = new_offset; +} - ASSERT(next->block_start != EXTENT_MAP_DELALLOC && - prev->block_start != EXTENT_MAP_DELALLOC); - - if (prev->map_lookup || next->map_lookup) - ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) && - test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags)); - - if (extent_map_end(prev) == next->start && - prev->flags == next->flags && - prev->map_lookup == next->map_lookup && - ((next->block_start == EXTENT_MAP_HOLE && - prev->block_start == EXTENT_MAP_HOLE) || - (next->block_start == EXTENT_MAP_INLINE && - prev->block_start == EXTENT_MAP_INLINE) || - (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && - next->block_start == extent_map_block_end(prev)))) { - return 1; +static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, + struct extent_map *em) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) + return; + btrfs_crit(fs_info, +"%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x", + prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes, + em->ram_bytes, em->offset, em->flags); + ASSERT(0); +} + +/* Internal sanity checks for btrfs debug builds. 
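+ * Both this helper and dump_extent_map() above return early unless + * CONFIG_BTRFS_DEBUG is enabled (see the IS_ENABLED() checks), so release + * builds pay no cost for these checks.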
*/ +static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) + return; + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + if (em->disk_num_bytes == 0) + dump_extent_map(fs_info, "zero disk_num_bytes", em); + if (em->offset + em->len > em->ram_bytes) + dump_extent_map(fs_info, "ram_bytes too small", em); + if (em->offset + em->len > em->disk_num_bytes && + !extent_map_is_compressed(em)) + dump_extent_map(fs_info, "disk_num_bytes too small", em); + if (!extent_map_is_compressed(em) && + em->ram_bytes != em->disk_num_bytes) + dump_extent_map(fs_info, + "ram_bytes mismatch with disk_num_bytes for non-compressed em", + em); + } else if (em->offset) { + dump_extent_map(fs_info, "non-zero offset for hole/inline", em); } - return 0; } -static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) +static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -244,46 +354,51 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) if (refcount_read(&em->refs) > 2) return; + if (!can_merge_extent_map(em)) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(merge, em)) { + if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; - em->orig_start = merge->orig_start; em->len += merge->len; - em->block_len += merge->block_len; - em->block_start = merge->block_start; - em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; - em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - set_bit(EXTENT_FLAG_MERGED, &em->flags); - rb_erase_cached(&merge->rb_node, &tree->map); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + merge_ondisk_extents(merge, em, em); + em->flags |= EXTENT_FLAG_MERGED; + + validate_extent_map(fs_info, em); + rb_erase(&merge->rb_node, &tree->root); RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); + dec_evictable_extent_maps(inode); } } rb = rb_next(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(em, merge)) { + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; - em->block_len += merge->block_len; - rb_erase_cached(&merge->rb_node, &tree->map); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + merge_ondisk_extents(em, merge, em); + validate_extent_map(fs_info, em); + rb_erase(&merge->rb_node, &tree->root); RB_CLEAR_NODE(&merge->rb_node); - em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); - set_bit(EXTENT_FLAG_MERGED, &em->flags); + em->flags |= EXTENT_FLAG_MERGED; free_extent_map(merge); + dec_evictable_extent_maps(inode); } } /* * Unpin an extent from the cache. * - * @tree: tree to unpin the extent in + * @inode: the inode from which we are unpinning an extent range * @start: logical offset in the file * @len: length of the extent * @gen: generation that this extent has been modified in @@ -291,133 +406,108 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) * Called after an extent has been written to disk properly. 
Set the generation * to the generation that actually added the file item to the inode so we know * we need to sync this extent when we call fsync(). + * + * Returns: 0 on success + * -ENOENT when the extent is not found in the tree + * -EUCLEAN if the found extent does not match the expected start */ -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, - u64 gen) +int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; struct extent_map *em; - bool prealloc = false; write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); - WARN_ON(!em || em->start != start); - - if (!em) + if (WARN_ON(!em)) { + btrfs_warn(fs_info, +"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", + btrfs_ino(inode), btrfs_root_id(inode->root), + start, start + len, gen); + ret = -ENOENT; goto out; + } - em->generation = gen; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - em->mod_start = em->start; - em->mod_len = em->len; - - if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { - prealloc = true; - clear_bit(EXTENT_FLAG_FILLING, &em->flags); + if (WARN_ON(em->start != start)) { + btrfs_warn(fs_info, +"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", + btrfs_ino(inode), btrfs_root_id(inode->root), + em->start, start, start + len, gen); + ret = -EUCLEAN; + goto out; } - try_merge_map(tree, em); + em->generation = gen; + em->flags &= ~EXTENT_FLAG_PINNED; - if (prealloc) { - em->mod_start = em->start; - em->mod_len = em->len; - } + try_merge_map(inode, em); - free_extent_map(em); out: write_unlock(&tree->lock); + free_extent_map(em); return ret; } -void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { - lockdep_assert_held_write(&tree->lock); + lockdep_assert_held_write(&inode->extent_tree.lock); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags &= ~EXTENT_FLAG_LOGGING; if (extent_map_in_tree(em)) - try_merge_map(tree, em); + try_merge_map(inode, em); } -static inline void setup_extent_mapping(struct extent_map_tree *tree, +static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, int modified) { refcount_inc(&em->refs); - em->mod_start = em->start; - em->mod_len = em->len; + + ASSERT(list_empty(&em->list)); if (modified) - list_move(&em->list, &tree->modified_extents); + list_add(&em->list, &inode->extent_tree.modified_extents); else - try_merge_map(tree, em); -} - -static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) -{ - struct map_lookup *map = em->map_lookup; - u64 stripe_size = em->orig_block_len; - int i; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_io_stripe *stripe = &map->stripes[i]; - struct btrfs_device *device = stripe->dev; - - set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, NULL); - } -} - -static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) -{ - struct map_lookup *map = em->map_lookup; - u64 stripe_size = em->orig_block_len; - int i; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_io_stripe *stripe = &map->stripes[i]; - struct btrfs_device *device = stripe->dev; - - __clear_extent_bit(&device->alloc_state, 
stripe->physical, - stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, - NULL, NULL); - } + try_merge_map(inode, em); } /* - * Add new extent map to the extent tree + * Add a new extent map to an inode's extent map tree. * - * @tree: tree to insert new map in + * @inode: the target inode * @em: map to insert * @modified: indicate whether the given @em should be added to the * modified list, which indicates the extent needs to be logged * - * Insert @em into @tree or perform a simple forward/backward merge with - * existing mappings. The extent_map struct passed in will be inserted - * into the tree directly, with an additional reference taken, or a - * reference dropped if the merge attempt was successful. + * Insert @em into the @inode's extent map tree or perform a simple + * forward/backward merge with existing mappings. The extent_map struct passed + * in will be inserted into the tree directly, with an additional reference + * taken, or a reference dropped if the merge attempt was successful. */ -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em, int modified) +static int add_extent_mapping(struct btrfs_inode *inode, + struct extent_map *em, int modified) { - int ret = 0; + struct extent_map_tree *tree = &inode->extent_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; lockdep_assert_held_write(&tree->lock); - ret = tree_insert(&tree->map, em); + validate_extent_map(fs_info, em); + ret = tree_insert(&tree->root, em); if (ret) - goto out; + return ret; - setup_extent_mapping(tree, em, modified); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) { - extent_map_device_set_bits(em, CHUNK_ALLOCATED); - extent_map_device_clear_bits(em, CHUNK_TRIMMED); - } -out: - return ret; + setup_extent_mapping(inode, em, modified); + + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) + percpu_counter_inc(&fs_info->evictable_extent_maps); + + return 0; } static struct extent_map * @@ -429,7 +519,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next); + rb_node = __tree_search(&tree->root, start, &prev_or_next); if (!rb_node) { if (prev_or_next) rb_node = prev_or_next; @@ -483,42 +573,49 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, } /* - * Remove an extent_map from the extent tree. + * Remove an extent_map from its inode's extent tree. * - * @tree: extent tree to remove from + * @inode: the inode the extent map belongs to * @em: extent map being removed * - * Remove @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use. + * Remove @em from the extent tree of @inode. No reference counts are dropped, + * and no checks are done to see if the range is in use. 
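+ * + * The caller keeps its own reference on @em and is still responsible for + * dropping it with free_extent_map() once it is done with the map.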
*/ -void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { + struct extent_map_tree *tree = &inode->extent_tree; + lockdep_assert_held_write(&tree->lock); - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - rb_erase_cached(&em->rb_node, &tree->map); - if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + WARN_ON(em->flags & EXTENT_FLAG_PINNED); + rb_erase(&em->rb_node, &tree->root); + if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - extent_map_device_clear_bits(em, CHUNK_ALLOCATED); RB_CLEAR_NODE(&em->rb_node); + + dec_evictable_extent_maps(inode); } -static void replace_extent_mapping(struct extent_map_tree *tree, +static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, int modified) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *tree = &inode->extent_tree; + lockdep_assert_held_write(&tree->lock); - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + validate_extent_map(fs_info, new); + + WARN_ON(cur->flags & EXTENT_FLAG_PINNED); ASSERT(extent_map_in_tree(cur)); - if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); - rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); + rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); RB_CLEAR_NODE(&cur->rb_node); - setup_extent_mapping(tree, new, modified); + setup_extent_mapping(inode, new, modified); } static struct extent_map *next_extent_map(const struct extent_map *em) @@ -547,7 +644,7 @@ static struct extent_map *prev_extent_map(struct extent_map *em) * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. */ -static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, +static noinline int merge_extent_mapping(struct btrfs_inode *inode, struct extent_map *existing, struct extent_map *em, u64 map_start) @@ -558,7 +655,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, u64 end; u64 start_diff; - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + if (map_start < em->start || map_start >= extent_map_end(em)) + return -EINVAL; if (existing->start > map_start) { next = existing; @@ -575,19 +673,15 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, start_diff = start - em->start; em->start = start; em->len = end - start; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - em->block_start += start_diff; - em->block_len = em->len; - } - return add_extent_mapping(em_tree, em, 0); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + em->offset += start_diff; + return add_extent_mapping(inode, em, 0); } /* - * Add extent mapping into em_tree. + * Add extent mapping into an inode's extent map tree. * - * @fs_info: the filesystem - * @em_tree: extent tree into which we want to insert the extent mapping + * @inode: target inode * @em_in: extent we are inserting * @start: start of the logical range btrfs_get_extent() is requesting * @len: length of the logical range btrfs_get_extent() is requesting @@ -595,8 +689,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. * - * Insert @em_in into @em_tree. 
In case there is an overlapping range, handle - * the -EEXIST by either: + * Insert @em_in into the inode's extent map tree. In case there is an + * overlapping range, handle the -EEXIST by either: * a) Returning the existing extent in @em_in if @start is within the * existing em. * b) Merge the existing extent with @em_in passed in. @@ -604,21 +698,21 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Return 0 on success, otherwise -EEXIST. * */ -int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len) { int ret; struct extent_map *em = *em_in; + struct btrfs_fs_info *fs_info = inode->root->fs_info; /* * Tree-checker should have rejected any inline extent with non-zero * file offset. Here just do a sanity check. */ - if (em->block_start == EXTENT_MAP_INLINE) + if (em->disk_bytenr == EXTENT_MAP_INLINE) ASSERT(em->start == 0); - ret = add_extent_mapping(em_tree, em, 0); + ret = add_extent_mapping(inode, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that * an overlapping map exists in the tree @@ -626,9 +720,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (ret == -EEXIST) { struct extent_map *existing; - ret = 0; - - existing = search_extent_mapping(em_tree, start, len); + existing = search_extent_mapping(&inode->extent_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -649,15 +741,14 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, * The existing extent map is the one nearest to * the [start, start + len) range which overlaps */ - ret = merge_extent_mapping(em_tree, existing, - em, start); - if (ret) { + ret = merge_extent_mapping(inode, existing, em, start); + if (WARN_ON(ret)) { free_extent_map(em); *em_in = NULL; - WARN_ONCE(ret, -"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", - ret, existing->start, existing->len, - orig_start, orig_len); + btrfs_warn(fs_info, +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", + existing->start, extent_map_end(existing), + orig_start, orig_start + orig_len, start); } free_extent_map(existing); } @@ -672,20 +763,26 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, * if needed. This avoids searching the tree, from the root down to the first * extent map, before each deletion. 
*/ -static void drop_all_extent_maps_fast(struct extent_map_tree *tree) +static void drop_all_extent_maps_fast(struct btrfs_inode *inode) { + struct extent_map_tree *tree = &inode->extent_tree; + struct rb_node *node; + write_lock(&tree->lock); - while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { + node = rb_first(&tree->root); + while (node) { struct extent_map *em; - struct rb_node *node; + struct rb_node *next = rb_next(node); - node = rb_first_cached(&tree->map); em = rb_entry(node, struct extent_map, rb_node); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - remove_extent_mapping(tree, em); + em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); + remove_extent_mapping(inode, em); free_extent_map(em); - cond_resched_rwlock_write(&tree->lock); + + if (cond_resched_rwlock_write(&tree->lock)) + node = rb_first(&tree->root); + else + node = next; } write_unlock(&tree->lock); } @@ -716,7 +813,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, WARN_ON(end < start); if (end == (u64)-1) { if (start == 0 && !skip_pinned) { - drop_all_extent_maps_fast(em_tree); + drop_all_extent_maps_fast(inode); return; } len = (u64)-1; @@ -746,7 +843,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, u64 gen; unsigned long flags; bool modified; - bool compressed; if (em_end < end) { next_em = next_extent_map(em); @@ -758,19 +854,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, } } - if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) { start = em_end; goto next; } flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); /* * In case we split the extent map, we want to preserve the * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want * it on the new extent maps. 
*/ - clear_bit(EXTENT_FLAG_LOGGING, &flags); + em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); modified = !list_empty(&em->list); /* @@ -781,7 +876,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, goto remove_em; gen = em->generation; - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); if (em->start < start) { if (!split) { @@ -793,29 +887,21 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->start = em->start; split->len = start - em->start; - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_start = em->orig_start; - split->block_start = em->block_start; - - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->orig_block_len = max(split->block_len, - em->orig_block_len); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + split->disk_bytenr = em->disk_bytenr; + split->disk_num_bytes = em->disk_num_bytes; + split->offset = em->offset; split->ram_bytes = em->ram_bytes; } else { - split->orig_start = split->start; - split->block_len = 0; - split->block_start = em->block_start; - split->orig_block_len = 0; + split->disk_bytenr = em->disk_bytenr; + split->disk_num_bytes = 0; + split->offset = 0; split->ram_bytes = split->len; } split->generation = gen; split->flags = flags; - split->compress_type = em->compress_type; - replace_extent_mapping(em_tree, em, split, modified); + replace_extent_mapping(inode, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -829,41 +915,26 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, } split->start = end; split->len = em_end - end; - split->block_start = em->block_start; + split->disk_bytenr = em->disk_bytenr; split->flags = flags; - split->compress_type = em->compress_type; split->generation = gen; - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_block_len = max(em->block_len, - em->orig_block_len); - + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + split->disk_num_bytes = em->disk_num_bytes; + split->offset = em->offset + end - em->start; split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->orig_start = em->orig_start; - } else { - const u64 diff = end - em->start; - - split->block_len = split->len; - split->block_start += diff; - split->orig_start = em->orig_start; - } } else { + split->disk_num_bytes = 0; + split->offset = 0; split->ram_bytes = split->len; - split->orig_start = split->start; - split->block_len = 0; - split->orig_block_len = 0; } if (extent_map_in_tree(em)) { - replace_extent_mapping(em_tree, em, split, - modified); + replace_extent_mapping(inode, em, split, modified); } else { int ret; - ret = add_extent_mapping(em_tree, split, - modified); + ret = add_extent_mapping(inode, split, modified); /* Logic error, shouldn't happen. 
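 * An -EEXIST here would mean the new split map overlaps an existing one, * which cannot happen since it covers a subrange of the extent map that * was just taken out of the tree while holding the tree's lock.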
*/ ASSERT(ret == 0); if (WARN_ON(ret != 0) && modified) @@ -898,7 +969,7 @@ remove_em: ASSERT(!split); btrfs_set_inode_full_sync(inode); } - remove_extent_mapping(em_tree, em); + remove_extent_mapping(inode, em); } /* @@ -953,7 +1024,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, do { btrfs_drop_extent_map_range(inode, new_em->start, end, false); write_lock(&tree->lock); - ret = add_extent_mapping(tree, new_em, modified); + ret = add_extent_mapping(inode, new_em, modified); write_unlock(&tree->lock); } while (ret == -EEXIST); @@ -997,28 +1068,26 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, } ASSERT(em->len == len); - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); - ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); - ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(!extent_map_is_compressed(em)); + ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); + ASSERT(em->flags & EXTENT_FLAG_PINNED); + ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); ASSERT(!list_empty(&em->list)); flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags &= ~EXTENT_FLAG_PINNED; /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; split_pre->len = pre; - split_pre->orig_start = split_pre->start; - split_pre->block_start = new_logical; - split_pre->block_len = split_pre->len; - split_pre->orig_block_len = split_pre->block_len; + split_pre->disk_bytenr = new_logical; + split_pre->disk_num_bytes = split_pre->len; + split_pre->offset = 0; split_pre->ram_bytes = split_pre->len; split_pre->flags = flags; - split_pre->compress_type = em->compress_type; split_pre->generation = em->generation; - replace_extent_mapping(em_tree, em, split_pre, 1); + replace_extent_mapping(inode, em, split_pre, 1); /* * Now we only have an extent_map at: @@ -1028,15 +1097,13 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, /* Insert the middle extent_map. */ split_mid->start = em->start + pre; split_mid->len = em->len - pre; - split_mid->orig_start = split_mid->start; - split_mid->block_start = em->block_start + pre; - split_mid->block_len = split_mid->len; - split_mid->orig_block_len = split_mid->block_len; + split_mid->disk_bytenr = extent_map_block_start(em) + pre; + split_mid->disk_num_bytes = split_mid->len; + split_mid->offset = 0; split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; - split_mid->compress_type = em->compress_type; split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); + add_extent_mapping(inode, split_mid, 1); /* Once for us */ free_extent_map(em); @@ -1051,3 +1118,303 @@ out_free_pre: free_extent_map(split_pre); return ret; } + +struct btrfs_em_shrink_ctx { + long nr_to_scan; + long scanned; + u64 last_ino; + u64 last_root; +}; + +static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); + struct extent_map_tree *tree = &inode->extent_tree; + long nr_dropped = 0; + struct rb_node *node; + + lockdep_assert_held_write(&tree->lock); + + /* + * Take the mmap lock so that we serialize with the inode logging phase + * of fsync because we may need to set the full sync flag on the inode, + * in case we have to remove extent maps in the tree's list of modified + * extents. 
If we set the full sync flag in the inode while an fsync is + * in progress, we may risk missing new extents because before the flag + * is set, fsync decides to only wait for writeback to complete and then + * during inode logging it sees the flag set and uses the subvolume tree + * to find new extents, which may not be there yet because ordered + * extents haven't completed yet. + * + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. + */ + if (!down_read_trylock(&inode->i_mmap_lock)) + return 0; + + node = rb_first(&tree->root); + while (node) { + struct rb_node *next = rb_next(node); + struct extent_map *em; + + em = rb_entry(node, struct extent_map, rb_node); + ctx->scanned++; + + if (em->flags & EXTENT_FLAG_PINNED) + goto next; + + /* + * If the extent map is in the list of modified extents (new) and its + * generation is the same (or is greater than) the current fs + * generation, it means it was not yet persisted so we have to + * set the full sync flag so that the next fsync will not miss + * it. + */ + if (!list_empty(&em->list) && em->generation >= cur_fs_gen) + btrfs_set_inode_full_sync(inode); + + remove_extent_mapping(inode, em); + trace_btrfs_extent_map_shrinker_remove_em(inode, em); + /* Drop the reference for the tree. */ + free_extent_map(em); + nr_dropped++; +next: + if (ctx->scanned >= ctx->nr_to_scan) + break; + + /* + * Stop if we need to reschedule or there's contention on the + * lock. This is to avoid slowing other tasks trying to take the + * lock. + */ + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) + break; + node = next; + } + up_read(&inode->i_mmap_lock); + + return nr_dropped; +} + +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps.
+ */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + +static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_inode *inode; + long nr_dropped = 0; + u64 min_ino = ctx->last_ino + 1; + + inode = find_first_inode_to_shrink(root, min_ino); + while (inode) { + nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); + + min_ino = btrfs_ino(inode) + 1; + ctx->last_ino = btrfs_ino(inode); + iput(&inode->vfs_inode); + + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(fs_info)) + break; + + cond_resched(); + + inode = find_first_inode_to_shrink(root, min_ino); + } + + if (inode) { + /* + * There are still inodes in this root or we happened to process + * the last one and reached the scan limit. In either case set + * the current root to this one, so we'll resume from the next + * inode if there is one or we will find out this was the last + * one and move to the next root. + */ + ctx->last_root = btrfs_root_id(root); + } else { + /* + * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so + * that when processing the next root we start from its first inode. + */ + ctx->last_ino = 0; + ctx->last_root = btrfs_root_id(root) + 1; + } + + return nr_dropped; +} + +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_em_shrink_ctx ctx; + u64 start_root_id; + u64 next_root_id; + bool cycled = false; + long nr_dropped = 0; + + fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work); + + ctx.scanned = 0; + ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); + + /* + * In case we have multiple tasks running this shrinker, make the next + * one start from the next inode in case it starts before we finish. 
*/ + spin_lock(&fs_info->extent_map_shrinker_lock); + ctx.last_ino = fs_info->extent_map_shrinker_last_ino; + fs_info->extent_map_shrinker_last_ino++; + ctx.last_root = fs_info->extent_map_shrinker_last_root; + spin_unlock(&fs_info->extent_map_shrinker_lock); + + start_root_id = ctx.last_root; + next_root_id = ctx.last_root; + + if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { + s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan, + nr, ctx.last_root, + ctx.last_ino); + } + + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { + struct btrfs_root *root; + unsigned long count; + + cond_resched(); + + spin_lock(&fs_info->fs_roots_radix_lock); + count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)&root, + (unsigned long)next_root_id, 1); + if (count == 0) { + spin_unlock(&fs_info->fs_roots_radix_lock); + if (start_root_id > 0 && !cycled) { + next_root_id = 0; + ctx.last_root = 0; + ctx.last_ino = 0; + cycled = true; + continue; + } + break; + } + next_root_id = btrfs_root_id(root) + 1; + root = btrfs_grab_root(root); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (!root) + continue; + + if (is_fstree(btrfs_root_id(root))) + nr_dropped += btrfs_scan_root(root, &ctx); + + btrfs_put_root(root); + } + + /* + * In case of multiple tasks running this extent map shrinking code this + * isn't perfect but it's simple and silences things like KCSAN. It's + * not possible to know which task made more progress because we can + * cycle back to the first root and first inode if it's not the first + * time the shrinker ran, see the above logic. Also a task that started + * later may finish earlier than another task while having made less progress. So + * make this simple and update to the progress of the last task that + * finished, with the occasional possibility of having two consecutive + * runs of the shrinker process the same inodes. + */ + spin_lock(&fs_info->extent_map_shrinker_lock); + fs_info->extent_map_shrinker_last_ino = ctx.last_ino; + fs_info->extent_map_shrinker_last_root = ctx.last_root; + spin_unlock(&fs_info->extent_map_shrinker_lock); + + if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { + s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, + nr, ctx.last_root, + ctx.last_ino); + } + + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker makes enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan.
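+ * + * For example, if three concurrent reclaim calls each pass nr_to_scan=128, + * only the first cmpxchg below (0 -> 128) succeeds and queues the work; the + * other two see a non-zero counter and return without queueing anything.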
+ */ + if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker); +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 35d27c756e08..cd123b266b64 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,45 +3,91 @@ #ifndef BTRFS_EXTENT_MAP_H #define BTRFS_EXTENT_MAP_H +#include <linux/compiler_types.h> +#include <linux/spinlock_types.h> #include <linux/rbtree.h> +#include <linux/list.h> #include <linux/refcount.h> +#include "misc.h" +#include "compression.h" + +struct btrfs_inode; +struct btrfs_fs_info; #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) -/* used only during fiemap calls */ -#define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the extent_map::flags field */ enum { /* this entry not yet on disk, don't free it */ - EXTENT_FLAG_PINNED, - EXTENT_FLAG_COMPRESSED, + ENUM_BIT(EXTENT_FLAG_PINNED), + ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB), + ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO), + ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD), /* pre-allocated extent */ - EXTENT_FLAG_PREALLOC, + ENUM_BIT(EXTENT_FLAG_PREALLOC), /* Logging this extent */ - EXTENT_FLAG_LOGGING, - /* Filling in a preallocated extent */ - EXTENT_FLAG_FILLING, - /* filesystem extent mapping type */ - EXTENT_FLAG_FS_MAPPING, + ENUM_BIT(EXTENT_FLAG_LOGGING), /* This em is merged from two or more physically adjacent ems */ - EXTENT_FLAG_MERGED, + ENUM_BIT(EXTENT_FLAG_MERGED), }; +/* + * This structure represents file extents and holes. + * + * Unlike on-disk file extent items, extent maps can be merged to save memory. + * This means members only match file extent items before any merging. + * + * Keep this structure as compact as possible, as we can have really large + * amounts of allocated extent maps at any time. + */ struct extent_map { struct rb_node rb_node; - /* all of these are in bytes */ + /* All of these are in bytes. */ + + /* File offset matching the offset of a BTRFS_EXTENT_ITEM_KEY key. */ u64 start; + + /* + * Length of the file extent. + * + * For non-inlined file extents it's btrfs_file_extent_item::num_bytes. + * For inline extents it's sectorsize, since inline data starts at + * offsetof(struct btrfs_file_extent_item, disk_bytenr) thus + * btrfs_file_extent_item::num_bytes is not valid. + */ u64 len; - u64 mod_start; - u64 mod_len; - u64 orig_start; - u64 orig_block_len; + + /* + * The bytenr of the full on-disk extent. + * + * For regular extents it's btrfs_file_extent_item::disk_bytenr. + * For holes it's EXTENT_MAP_HOLE and for inline extents it's + * EXTENT_MAP_INLINE. + */ + u64 disk_bytenr; + + /* + * The full on-disk extent length, matching + * btrfs_file_extent_item::disk_num_bytes. + */ + u64 disk_num_bytes; + + /* + * Offset inside the decompressed extent. + * + * For regular extents it's btrfs_file_extent_item::offset. + * For holes and inline extents it's 0. + */ + u64 offset; + + /* + * The decompressed size of the whole on-disk extent, matching + * btrfs_file_extent_item::ram_bytes. 
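+ * + * For compressed extents this is typically larger than disk_num_bytes, + * while for uncompressed regular extents the two must match (one of the + * checks done by validate_extent_map()).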
+ */ u64 ram_bytes; - u64 block_start; - u64 block_len; /* * Generation of the extent map, for merged em it's the highest @@ -49,47 +95,81 @@ struct extent_map { * For non-merged extents, it's from btrfs_file_extent_item::generation. */ u64 generation; - unsigned long flags; - /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ - struct map_lookup *map_lookup; + u32 flags; refcount_t refs; - unsigned int compress_type; struct list_head list; }; struct extent_map_tree { - struct rb_root_cached map; + struct rb_root root; struct list_head modified_extents; rwlock_t lock; }; struct btrfs_inode; +static inline void extent_map_set_compression(struct extent_map *em, + enum btrfs_compression_type type) +{ + if (type == BTRFS_COMPRESS_ZLIB) + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + else if (type == BTRFS_COMPRESS_LZO) + em->flags |= EXTENT_FLAG_COMPRESS_LZO; + else if (type == BTRFS_COMPRESS_ZSTD) + em->flags |= EXTENT_FLAG_COMPRESS_ZSTD; +} + +static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em) +{ + if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB) + return BTRFS_COMPRESS_ZLIB; + + if (em->flags & EXTENT_FLAG_COMPRESS_LZO) + return BTRFS_COMPRESS_LZO; + + if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD) + return BTRFS_COMPRESS_ZSTD; + + return BTRFS_COMPRESS_NONE; +} + +/* + * More efficient way to determine if extent is compressed, instead of using + * 'extent_map_compression() != BTRFS_COMPRESS_NONE'. + */ +static inline bool extent_map_is_compressed(const struct extent_map *em) +{ + return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB | + EXTENT_FLAG_COMPRESS_LZO | + EXTENT_FLAG_COMPRESS_ZSTD)) != 0; +} + static inline int extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); } -static inline u64 extent_map_end(struct extent_map *em) +static inline u64 extent_map_block_start(const struct extent_map *em) { - if (em->start + em->len < em->start) - return (u64)-1; - return em->start + em->len; + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + if (extent_map_is_compressed(em)) + return em->disk_bytenr; + return em->disk_bytenr + em->offset; + } + return em->disk_bytenr; } -static inline u64 extent_map_block_end(struct extent_map *em) +static inline u64 extent_map_end(const struct extent_map *em) { - if (em->block_start + em->block_len < em->block_start) + if (em->start + em->len < em->start) return (u64)-1; - return em->block_start + em->block_len; + return em->start + em->len; } void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em, int modified); -void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, u64 new_logical); @@ -97,12 +177,11 @@ struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void __cold extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); -void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); +int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); +void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree 
*tree, u64 start, u64 len); -int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len); void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, @@ -110,5 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c new file mode 100644 index 000000000000..df7f09f3b02e --- /dev/null +++ b/fs/btrfs/fiemap.c @@ -0,0 +1,930 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "backref.h" +#include "btrfs_inode.h" +#include "fiemap.h" +#include "file.h" +#include "file-item.h" + +struct btrfs_fiemap_entry { + u64 offset; + u64 phys; + u64 len; + u32 flags; +}; + +/* + * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the file + * range from the inode's io tree, unlock the subvolume tree search path, flush + * the fiemap cache, relock the file range and re-search the subvolume tree. + * The value here is something negative that can't be confused with a valid + * errno value and different from 1 because that's also a return value from + * fiemap_fill_next_extent() and also it's often used to mean some btree search + * did not find a key, so make it some distinct negative value. + */ +#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1)) + +/* + * Used to: + * + * - Cache the next entry to be emitted to the fiemap buffer, so that we can + * merge extents that are contiguous and can be grouped as a single one; + * + * - Store extents ready to be written to the fiemap buffer in an intermediary + * buffer. This intermediary buffer is to ensure that in case the fiemap + * buffer is memory mapped to the fiemap target file, we don't deadlock + * during btrfs_page_mkwrite(). This is because during fiemap we are locking + * an extent range in order to prevent races with delalloc flushing and + * ordered extent completion, which is needed in order to reliably detect + * delalloc in holes and prealloc extents. And this can lead to a deadlock + * if the fiemap buffer is memory mapped to the file we are running fiemap + * against (a silly, useless in practice scenario, but possible) because + * btrfs_page_mkwrite() will try to lock the same extent range. + */ +struct fiemap_cache { + /* An array of ready fiemap entries. */ + struct btrfs_fiemap_entry *entries; + /* Number of entries in the entries array. */ + int entries_size; + /* Index of the next entry in the entries array to write to. */ + int entries_pos; + /* + * Once the entries array is full, this indicates the offset of the + * next file extent item we must search for in the inode's subvolume + * tree after unlocking the extent range in the inode's io tree and + * releasing the search path. + */ + u64 next_search_offset; + /* + * This matches struct fiemap_extent_info::fi_mapped_extents, we use it + * to count the extents we have emitted ourselves and stop, instead of relying on + * fiemap_fill_next_extent(), because we buffer ready fiemap entries in + * the @entries array, and we want to stop as soon as we hit the max + * amount of extents to map, not just to save time but also to make the + * logic at extent_fiemap() simpler.
+ */ + unsigned int extents_mapped; + /* Fields for the cached extent (the unsubmitted, not yet ready, extent). */ + u64 offset; + u64 phys; + u64 len; + u32 flags; + bool cached; +}; + +static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + for (int i = 0; i < cache->entries_pos; i++) { + struct btrfs_fiemap_entry *entry = &cache->entries[i]; + int ret; + + ret = fiemap_fill_next_extent(fieinfo, entry->offset, + entry->phys, entry->len, + entry->flags); + /* + * Ignore 1 (reached max entries) because we keep track of that + * ourselves in emit_fiemap_extent(). + */ + if (ret < 0) + return ret; + } + cache->entries_pos = 0; + + return 0; +} + +/* + * Helper to submit a fiemap extent. + * + * Will try to merge the current fiemap extent, specified by @offset, @phys, + * @len and @flags, with the cached one. + * Only when we fail to merge is the cached one submitted as a + * fiemap extent. + * + * Return value is the same as fiemap_fill_next_extent(). + */ +static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + u64 offset, u64 phys, u64 len, u32 flags) +{ + struct btrfs_fiemap_entry *entry; + u64 cache_end; + + /* Set at the end of extent_fiemap(). */ + ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); + + if (!cache->cached) + goto assign; + + /* + * When iterating the extents of the inode, at extent_fiemap(), we may + * find an extent that starts at an offset behind the end offset of the + * previous extent we processed. This happens if fiemap is called + * without FIEMAP_FLAG_SYNC and there are ordered extents completing + * after we had to unlock the file range, release the search path, emit + * the fiemap extents stored in the buffer (cache->entries array) and + * then lock the remainder of the range and re-search the btree. + * + * For example we are in leaf X processing its last item, which is the + * file extent item for file range [512K, 1M[, and after + * btrfs_next_leaf() releases the path, there's an ordered extent that + * completes for the file range [768K, 2M[, and that results in trimming + * the file extent item so that it now corresponds to the file range + * [512K, 768K[ and a new file extent item is inserted for the file + * range [768K, 2M[, which may end up as the last item of leaf X or as + * the first item of the next leaf - in either case btrfs_next_leaf() + * will leave us with a path pointing to the new extent item, for the + * file range [768K, 2M[, since that's the first key that follows the + * last one we processed. So in order not to report overlapping extents + * to user space, we trim the length of the previously cached extent and + * emit it. + * + * Upon calling btrfs_next_leaf() we may also find an extent with an + * offset smaller than or equal to cache->offset, and this happens + * when we had a hole or prealloc extent with several delalloc ranges in + * it, but after btrfs_next_leaf() released the path, delalloc was + * flushed and the resulting ordered extents were completed, so we can + * now have found a file extent item for an offset that is smaller than + * or equal to what we have in cache->offset. We deal with this as + * described below. + */ + cache_end = cache->offset + cache->len; + if (cache_end > offset) { + if (offset == cache->offset) { + /* + * We cached a delalloc range (found in the io tree) for + * a hole or prealloc extent and we have now found a + * file extent item for the same offset.
What we have + * now is more recent and up to date, so discard what + * we had in the cache and use what we have just found. + */ + goto assign; + } else if (offset > cache->offset) { + /* + * The extent range we previously found ends after the + * offset of the file extent item we found and that + * offset falls somewhere in the middle of that previous + * extent range. So adjust the range we previously found + * to end at the offset of the file extent item we have + * just found, since this extent is more up to date. + * Emit that adjusted range and cache the file extent + * item we have just found. This corresponds to the case + * where a previously found file extent item was split + * due to an ordered extent completing. + */ + cache->len = offset - cache->offset; + goto emit; + } else { + const u64 range_end = offset + len; + + /* + * The offset of the file extent item we have just found + * is behind the cached offset. This means we were + * processing a hole or prealloc extent for which we + * have found delalloc ranges (in the io tree), so what + * we have in the cache is the last delalloc range we + * found while the file extent item we found can be + * either for a whole delalloc range we previously + * emmitted or only a part of that range. + * + * We have two cases here: + * + * 1) The file extent item's range ends at or behind the + * cached extent's end. In this case just ignore the + * current file extent item because we don't want to + * overlap with previous ranges that may have been + * emmitted already; + * + * 2) The file extent item starts behind the currently + * cached extent but its end offset goes beyond the + * end offset of the cached extent. We don't want to + * overlap with a previous range that may have been + * emmitted already, so we emit the currently cached + * extent and then partially store the current file + * extent item's range in the cache, for the subrange + * going the cached extent's end to the end of the + * file extent item. + */ + if (range_end <= cache_end) + return 0; + + if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) + phys += cache_end - offset; + + offset = cache_end; + len = range_end - cache_end; + goto emit; + } + } + + /* + * Only merges fiemap extents if + * 1) Their logical addresses are continuous + * + * 2) Their physical addresses are continuous + * So truly compressed (physical size smaller than logical size) + * extents won't get merged with each other + * + * 3) Share same flags + */ + if (cache->offset + cache->len == offset && + cache->phys + cache->len == phys && + cache->flags == flags) { + cache->len += len; + return 0; + } + +emit: + /* Not mergeable, need to submit cached one */ + + if (cache->entries_pos == cache->entries_size) { + /* + * We will need to research for the end offset of the last + * stored extent and not from the current offset, because after + * unlocking the range and releasing the path, if there's a hole + * between that end offset and this current offset, a new extent + * may have been inserted due to a new write, so we don't want + * to miss it. 
+ */ + entry = &cache->entries[cache->entries_size - 1]; + cache->next_search_offset = entry->offset + entry->len; + cache->cached = false; + + return BTRFS_FIEMAP_FLUSH_CACHE; + } + + entry = &cache->entries[cache->entries_pos]; + entry->offset = cache->offset; + entry->phys = cache->phys; + entry->len = cache->len; + entry->flags = cache->flags; + cache->entries_pos++; + cache->extents_mapped++; + + if (cache->extents_mapped == fieinfo->fi_extents_max) { + cache->cached = false; + return 1; + } +assign: + cache->cached = true; + cache->offset = offset; + cache->phys = phys; + cache->len = len; + cache->flags = flags; + + return 0; +} + +/* + * Emit last fiemap cache + * + * The last fiemap cache may still be cached in the following case: + * 0 4k 8k + * |<- Fiemap range ->| + * |<------------ First extent ----------->| + * + * In this case, the first extent range will be cached but not emitted. + * So we must emit it before ending extent_fiemap(). + */ +static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + int ret; + + if (!cache->cached) + return 0; + + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret > 0) + ret = 0; + return ret; +} + +static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) +{ + struct extent_buffer *clone = path->nodes[0]; + struct btrfs_key key; + int slot; + int ret; + + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) + return 0; + + /* + * Add a temporary extra ref to an already cloned extent buffer to + * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid + * the cost of allocating a new one. + */ + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); + atomic_inc(&clone->refs); + + ret = btrfs_next_leaf(inode->root, path); + if (ret != 0) + goto out; + + /* + * Don't bother with cloning if there are no more file extent items for + * our inode. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { + ret = 1; + goto out; + } + + /* + * Important to preserve the start field, for the optimizations when + * checking if extents are shared (see extent_fiemap()). + * + * We must set ->start before calling copy_extent_buffer_full(). If we + * are on sub-pagesize blocksize, we use ->start to determine the offset + * into the folio where our eb exists, and if we update ->start after + * the fact then any subsequent reads of the eb may read from a + * different offset in the folio than where we originally copied into. + */ + clone->start = path->nodes[0]->start; + /* See the comment at fiemap_search_slot() about why we clone. */ + copy_extent_buffer_full(clone, path->nodes[0]); + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; +out: + if (ret) + free_extent_buffer(clone); + + return ret; +} + +/* + * Search for the first file extent item that starts at a given file offset or + * the one that starts immediately before that offset. + * Returns: 0 on success, < 0 on error, 1 if not found. 
+ */ +static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, + u64 file_offset) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = file_offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret != 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + } + + /* + * We clone the leaf and use it during fiemap. This is because while + * using the leaf we do expensive things like checking if an extent is + * shared, which can take a long time. In order to prevent blocking + * other tasks for too long, we use a clone of the leaf. We have locked + * the file range in the inode's io tree, so we know none of our file + * extent items can change. This way we avoid blocking other tasks that + * want to insert items for other inodes in the same leaf or b+tree + * rebalance operations (triggered for example when someone is trying + * to push items into this leaf when trying to insert an item in a + * neighbour leaf). + * We also need the private clone because holding a read lock on an + * extent buffer of the subvolume's b+tree will make lockdep unhappy + * when we check if extents are shared, as backref walking may need to + * lock the same leaf we are processing. + */ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Process a range which is a hole or a prealloc extent in the inode's subvolume + * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc + * extent. The end offset (@end) is inclusive. + */ +static int fiemap_process_hole(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + struct extent_state **delalloc_cached_state, + struct btrfs_backref_share_check_ctx *backref_ctx, + u64 disk_bytenr, u64 extent_offset, + u64 extent_gen, + u64 start, u64 end) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + u64 cur_offset = start; + u64 last_delalloc_end = 0; + u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; + bool checked_extent_shared = false; + int ret; + + /* + * There can be no delalloc past i_size, so don't waste time looking for + * it beyond i_size. + */ + while (cur_offset < end && cur_offset < i_size) { + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; + u64 prealloc_len = 0; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; + + /* + * If this is a prealloc extent we have to report every section + * of it that has no delalloc. 
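The loop above walks a hole or prealloc extent and alternates between delalloc ranges found in the io tree and the untouched prealloc sections between them. A minimal userspace sketch of that interleaving, assuming a sorted array of delalloc ranges contained in the extent and using printf() in place of emit_fiemap_extent():

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };	/* end is inclusive */

/* Report a prealloc extent [start, end] with delalloc ranges inside it. */
static void report_prealloc(uint64_t start, uint64_t end,
			    const struct range *delalloc, int nr)
{
	uint64_t cur = start;

	for (int i = 0; i < nr && cur <= end; i++) {
		if (delalloc[i].start > cur)	/* prealloc piece before delalloc */
			printf("UNWRITTEN [%llu, %llu]\n",
			       (unsigned long long)cur,
			       (unsigned long long)(delalloc[i].start - 1));
		printf("DELALLOC  [%llu, %llu]\n",
		       (unsigned long long)delalloc[i].start,
		       (unsigned long long)delalloc[i].end);
		cur = delalloc[i].end + 1;
	}
	if (cur <= end)				/* trailing prealloc piece */
		printf("UNWRITTEN [%llu, %llu]\n",
		       (unsigned long long)cur, (unsigned long long)end);
}

For a 1 MiB prealloc extent with one delalloc range at [128K, 192K - 1], this prints an UNWRITTEN piece, a DELALLOC piece, and a final UNWRITTEN piece, mirroring how the code above emits prealloc subranges around each delalloc range.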
+ */ + if (disk_bytenr != 0) { + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = delalloc_start - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = delalloc_start - prealloc_start; + } + } + + if (prealloc_len > 0) { + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + + checked_extent_shared = true; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + extent_offset += prealloc_len; + } + + ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, + FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ret) + return ret; + + last_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + extent_offset += cur_offset - delalloc_start; + cond_resched(); + } + + /* + * Either we found no delalloc for the whole prealloc extent or we have + * a prealloc extent that spans i_size or starts at or after i_size. + */ + if (disk_bytenr != 0 && last_delalloc_end < end) { + u64 prealloc_start; + u64 prealloc_len; + + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = end + 1 - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = end + 1 - prealloc_start; + } + + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + } + + return 0; +} + +static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + u64 *last_extent_end_ret) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 disk_bytenr; + int ret; + + /* + * Lookup the last file extent. We're not using i_size here because + * there might be preallocation past i_size. + */ + ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); + /* There can't be a file extent item at offset (u64)-1 */ + ASSERT(ret != 0); + if (ret < 0) + return ret; + + /* + * For a non-existing key, btrfs_search_slot() always leaves us at a + * slot > 0, except if the btree is empty, which is impossible because + * at least it has the inode item for this inode and all the items for + * the root inode 256. + */ + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* No file extent items in the subvolume tree. */ + *last_extent_end_ret = 0; + return 0; + } + + /* + * For an inline extent, the disk_bytenr is where inline data starts at, + * so first check if we have an inline extent item before checking if we + * have an implicit hole (disk_bytenr == 0). 
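The (u64)-1 lookup followed by path->slots[0]-- below is the classic btree idiom for "greatest key not exceeding the target": search for an impossible maximal key, then step back one slot. The same idea in a self-contained array-based form (illustrative only, not kernel API):

#include <stddef.h>
#include <stdint.h>

/*
 * Binary search over a sorted array, returning the index of the greatest
 * element <= target, or -1 if every element is greater. This mirrors
 * "search for the key, then step back one slot on a non-exact match".
 */
static ptrdiff_t floor_index(const uint64_t *keys, size_t n, uint64_t target)
{
	size_t lo = 0, hi = n;	/* lo converges to first index with key > target */

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (keys[mid] <= target)
			lo = mid + 1;
		else
			hi = mid;
	}
	return (ptrdiff_t)lo - 1;
}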
+ */
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+	if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+		*last_extent_end_ret = btrfs_file_extent_end(path);
+		return 0;
+	}
+
+	/*
+	 * Find the last file extent item that is not a hole (when NO_HOLES is
+	 * not enabled). This should take at most 2 iterations in the worst
+	 * case: we have one hole file extent item at slot 0 of a leaf and
+	 * another hole file extent item as the last item in the previous leaf.
+	 * This is because we merge file extent items that represent holes.
+	 */
+	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+	while (disk_bytenr == 0) {
+		ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+		if (ret < 0) {
+			return ret;
+		} else if (ret > 0) {
+			/* No file extent items that are not holes. */
+			*last_extent_end_ret = 0;
+			return 0;
+		}
+		leaf = path->nodes[0];
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+	}
+
+	*last_extent_end_ret = btrfs_file_extent_end(path);
+	return 0;
+}
+
+static int extent_fiemap(struct btrfs_inode *inode,
+			 struct fiemap_extent_info *fieinfo,
+			 u64 start, u64 len)
+{
+	const u64 ino = btrfs_ino(inode);
+	struct extent_state *cached_state = NULL;
+	struct extent_state *delalloc_cached_state = NULL;
+	struct btrfs_path *path;
+	struct fiemap_cache cache = { 0 };
+	struct btrfs_backref_share_check_ctx *backref_ctx;
+	u64 last_extent_end = 0;
+	u64 prev_extent_end;
+	u64 range_start;
+	u64 range_end;
+	const u64 sectorsize = inode->root->fs_info->sectorsize;
+	bool stopped = false;
+	int ret;
+
+	cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
+	cache.entries = kmalloc_array(cache.entries_size,
+				      sizeof(struct btrfs_fiemap_entry),
+				      GFP_KERNEL);
+	backref_ctx = btrfs_alloc_backref_share_check_ctx();
+	path = btrfs_alloc_path();
+	if (!cache.entries || !backref_ctx || !path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+restart:
+	range_start = round_down(start, sectorsize);
+	range_end = round_up(start + len, sectorsize);
+	prev_extent_end = range_start;
+
+	lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+
+	ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+	if (ret < 0)
+		goto out_unlock;
+	btrfs_release_path(path);
+
+	path->reada = READA_FORWARD;
+	ret = fiemap_search_slot(inode, path, range_start);
+	if (ret < 0) {
+		goto out_unlock;
+	} else if (ret > 0) {
+		/*
+		 * No file extent item found, but we may have delalloc between
+		 * the current offset and i_size. So check for that.
+		 */
+		ret = 0;
+		goto check_eof_delalloc;
+	}
+
+	while (prev_extent_end < range_end) {
+		struct extent_buffer *leaf = path->nodes[0];
+		struct btrfs_file_extent_item *ei;
+		struct btrfs_key key;
+		u64 extent_end;
+		u64 extent_len;
+		u64 extent_offset = 0;
+		u64 extent_gen;
+		u64 disk_bytenr = 0;
+		u64 flags = 0;
+		int extent_type;
+		u8 compression;
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		extent_end = btrfs_file_extent_end(path);
+
+		/*
+		 * The first iteration can leave us at an extent item that ends
+		 * before our range's start. Move to the next item.
+		 */
+		if (extent_end <= range_start)
+			goto next_item;
+
+		backref_ctx->curr_leaf_bytenr = leaf->start;
+
+		/* We have an implicit hole (NO_HOLES feature enabled).
*/ + if (prev_extent_end < key.offset) { + const u64 hole_end = min(key.offset, range_end) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, + prev_extent_end, hole_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + /* We've reached the end of the fiemap range, stop. */ + if (key.offset >= range_end) { + stopped = true; + break; + } + } + + extent_len = extent_end - key.offset; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + compression = btrfs_file_extent_compression(leaf, ei); + extent_type = btrfs_file_extent_type(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (compression == BTRFS_COMPRESS_NONE) + extent_offset = btrfs_file_extent_offset(leaf, ei); + } + + if (compression != BTRFS_COMPRESS_NONE) + flags |= FIEMAP_EXTENT_ENCODED; + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + flags |= FIEMAP_EXTENT_DATA_INLINE; + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, + extent_len, flags); + } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, + disk_bytenr, extent_offset, + extent_gen, key.offset, + extent_end - 1); + } else if (disk_bytenr == 0) { + /* We have an explicit hole. */ + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, + key.offset, extent_end - 1); + } else { + /* We have a regular extent. */ + if (fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); + if (ret < 0) + goto out_unlock; + else if (ret > 0) + flags |= FIEMAP_EXTENT_SHARED; + } + + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, + disk_bytenr + extent_offset, + extent_len, flags); + } + + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* emit_fiemap_extent() told us to stop. */ + stopped = true; + break; + } + + prev_extent_end = extent_end; +next_item: + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out_unlock; + } + + ret = fiemap_next_leaf_item(inode, path); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* No more file extent items for this inode. 
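Collecting the branches above, the FIEMAP flag selection is a small function of the extent's properties. A paraphrase using only the uapi flag names from <linux/fiemap.h> (the boolean inputs are stand-ins for the checks the loop performs on the file extent item; this helper is not a kernel function):

#include <linux/fiemap.h>
#include <stdbool.h>
#include <stdint.h>

/* Paraphrase of the flag selection in the main fiemap loop above. */
static uint32_t fiemap_flags_for_extent(bool inline_extent, bool compressed,
					bool prealloc, bool shared)
{
	uint32_t flags = 0;

	if (compressed)
		flags |= FIEMAP_EXTENT_ENCODED;
	if (inline_extent)
		flags |= FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
	if (prealloc)
		flags |= FIEMAP_EXTENT_UNWRITTEN;
	if (shared)
		flags |= FIEMAP_EXTENT_SHARED;
	return flags;
}

Delalloc ranges take a separate path: fiemap_process_hole() reports them with FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN, since they have no physical location yet.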
*/ + break; + } + cond_resched(); + } + +check_eof_delalloc: + if (!stopped && prev_extent_end < range_end) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, + 0, 0, 0, prev_extent_end, range_end - 1); + if (ret < 0) + goto out_unlock; + prev_extent_end = range_end; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_cached_state, + &delalloc_start, + &delalloc_end); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { + cache.flags |= FIEMAP_EXTENT_LAST; + } + } + +out_unlock: + unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + + if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { + btrfs_release_path(path); + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + len -= cache.next_search_offset - start; + start = cache.next_search_offset; + goto restart; + } else if (ret < 0) { + goto out; + } + + /* + * Must free the path before emitting to the fiemap buffer because we + * may have a non-cloned leaf and if the fiemap buffer is memory mapped + * to a file, a write into it (through btrfs_page_mkwrite()) may trigger + * waiting for an ordered extent that in order to complete needs to + * modify that leaf, therefore leading to a deadlock. + */ + btrfs_free_path(path); + path = NULL; + + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + + ret = emit_last_fiemap_cache(fieinfo, &cache); +out: + free_extent_state(delalloc_cached_state); + kfree(cache.entries); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); + return ret; +} + +int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + int ret; + + ret = fiemap_prep(inode, fieinfo, start, &len, 0); + if (ret) + return ret; + + /* + * fiemap_prep() called filemap_write_and_wait() for the whole possible + * file range (0 to LLONG_MAX), but that is not enough if we have + * compression enabled. The first filemap_fdatawrite_range() only kicks + * in the compression of data (in an async thread) and will return + * before the compression is done and writeback is started. A second + * filemap_fdatawrite_range() is needed to wait for the compression to + * complete and writeback to start. We also need to wait for ordered + * extents to complete, because our fiemap implementation uses mainly + * file extent items to list the extents, searching for extent maps + * only for file ranges with holes or prealloc extents to figure out + * if we have delalloc in those ranges. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); + if (ret) + return ret; + } + + btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); + + /* + * We did an initial flush to avoid holding the inode's lock while + * triggering writeback and waiting for the completion of IO and ordered + * extents. Now after we locked the inode we do it again, because it's + * possible a new write may have happened in between those two steps. 
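For context, the interface implemented here is exercised from user space roughly as follows; setting FIEMAP_FLAG_SYNC in fm_flags is what triggers the ordered-extent waits described above (error handling trimmed, a sketch rather than a robust tool):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY);
	unsigned int count = 32;
	struct fiemap *fm = calloc(1, sizeof(*fm) +
				   count * sizeof(struct fiemap_extent));

	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;/* flush delalloc before mapping */
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0) {
		for (unsigned int i = 0; i < fm->fm_mapped_extents; i++) {
			struct fiemap_extent *fe = &fm->fm_extents[i];

			printf("logical %llu physical %llu len %llu flags 0x%x\n",
			       (unsigned long long)fe->fe_logical,
			       (unsigned long long)fe->fe_physical,
			       (unsigned long long)fe->fe_length,
			       fe->fe_flags);
		}
	}
	free(fm);
	return 0;
}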
+ */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); + if (ret) { + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + return ret; + } + } + + ret = extent_fiemap(btrfs_inode, fieinfo, start, len); + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + + return ret; +} diff --git a/fs/btrfs/fiemap.h b/fs/btrfs/fiemap.h new file mode 100644 index 000000000000..cfd74b35988f --- /dev/null +++ b/fs/btrfs/fiemap.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FIEMAP_H +#define BTRFS_FIEMAP_H + +#include <linux/fiemap.h> + +int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +#endif /* BTRFS_FIEMAP_H */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 45cae356e89b..886749b39672 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -10,17 +10,14 @@ #include <linux/sched/mm.h> #include <crypto/hash.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "bio.h" -#include "print-tree.h" #include "compression.h" #include "fs.h" #include "accessors.h" #include "file-item.h" -#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -48,18 +45,17 @@ */ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start, end, i_size; int ret; spin_lock(&inode->lock); i_size = new_i_size ?: i_size_read(&inode->vfs_inode); - if (btrfs_fs_incompat(fs_info, NO_HOLES)) { + if (!inode->file_extent_tree) { inode->disk_i_size = i_size; goto out_unlock; } - ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, + ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, &end, EXTENT_DIRTY); if (!ret && start == 0) i_size = min(i_size, end + 1); @@ -87,14 +83,15 @@ out_unlock: int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len) { + if (!inode->file_extent_tree) + return 0; + if (len == 0) return 0; ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); - if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) - return 0; - return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, + return set_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -115,15 +112,16 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len) { + if (!inode->file_extent_tree) + return 0; + if (len == 0) return 0; ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || len == (u64)-1); - if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) - return 0; - return clear_extent_bit(&inode->file_extent_tree, start, + return clear_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -153,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) * Calculate the total size needed to allocate for an ordered sum structure * spanning @bytes in the file. 
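As a worked example of that size calculation, assuming a 4 KiB sectorsize and the default crc32c checksum of 4 bytes per sector: an ordered sum spanning 1 MiB covers 256 sectors and therefore needs 1024 bytes of checksum data on top of the struct itself. The arithmetic, with an illustrative helper (not the kernel's bytes_to_csum_size()):

#include <stddef.h>
#include <stdint.h>

/* Illustrative: bytes of checksum data needed for @bytes of file data. */
static size_t csum_bytes_for(size_t bytes, uint32_t sectorsize,
			     uint16_t csum_size)
{
	return (bytes / sectorsize) * csum_size;
}

/* csum_bytes_for(1 << 20, 4096, 4) == 1024 */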
*/ -static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes) { return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } @@ -179,7 +177,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -356,7 +353,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) + test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; /* @@ -434,8 +431,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) memset(csum_dst, 0, csum_size); count = 1; - if (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; set_extent_bit(&inode->io_tree, file_offset, @@ -454,9 +450,22 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) return ret; } +/* + * Search for checksums for a given logical range. + * + * @root: The root where to look for checksums. + * @start: Logical address of target checksum range. + * @end: End offset (inclusive) of the target checksum range. + * @list: List for adding each checksum that was found. + * Can be NULL in case the caller only wants to check if + * there any checksums for the range. + * @nowait: Indicate if the search must be non-blocking or not. + * + * Return < 0 on error, 0 if no checksums were found, or 1 if checksums were + * found. 
+ */ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait) + struct list_head *list, bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -464,8 +473,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; - LIST_HEAD(tmplist); int ret; + bool found_csums = false; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -475,11 +484,6 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, return -ENOMEM; path->nowait = nowait; - if (search_commit) { - path->skip_locking = 1; - path->reada = READA_FORWARD; - path->search_commit_root = 1; - } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.offset = start; @@ -487,7 +491,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto fail; + goto out; if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); @@ -522,7 +526,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto fail; + goto out; if (ret > 0) break; leaf = path->nodes[0]; @@ -544,6 +548,10 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, continue; } + found_csums = true; + if (!list) + goto out; + csum_end = min(csum_end, end + 1); item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); @@ -557,7 +565,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, GFP_NOFS); if (!sums) { ret = -ENOMEM; - goto fail; + goto out; } sums->logical = start; @@ -571,21 +579,24 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, bytes_to_csum_size(fs_info, size)); start += size; - list_add_tail(&sums->list, &tmplist); + list_add_tail(&sums->list, list); } path->slots[0]++; } - ret = 0; -fail: - while (ret < 0 && !list_empty(&tmplist)) { - sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); +out: + btrfs_free_path(path); + if (ret < 0) { + if (list) { + struct btrfs_ordered_sum *tmp_sums; + + list_for_each_entry_safe(sums, tmp_sums, list, list) + kfree(sums); + } + + return ret; } - list_splice_tail(&tmplist, list); - btrfs_free_path(path); - return ret; + return found_csums ? 1 : 0; } /* @@ -874,8 +885,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; - ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); path = btrfs_alloc_path(); if (!path) @@ -1175,7 +1186,7 @@ extend_csum: * search, etc, because log trees are temporary anyway and it * would only save a few bytes of leaf space. 
*/ - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { if (path->slots[0] + 1 >= btrfs_header_nritems(path->nodes[0])) { ret = find_next_csum_offset(root, path, &next_offset); @@ -1229,8 +1240,6 @@ insert: ins_size); if (ret < 0) goto out; - if (WARN_ON(ret != 0)) - goto out; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -1263,7 +1272,7 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1271,58 +1280,56 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; struct btrfs_key key; - u64 extent_start, extent_end; - u64 bytenr; + u64 extent_start; u8 type = btrfs_file_extent_type(leaf, fi); int compress_type = btrfs_file_extent_compression(leaf, fi); btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; - extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); em->generation = btrfs_file_extent_generation(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { + const u64 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + em->start = extent_start; - em->len = extent_end - extent_start; - em->orig_start = extent_start - - btrfs_file_extent_offset(leaf, fi); - em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (bytenr == 0) { - em->block_start = EXTENT_MAP_HOLE; + em->len = btrfs_file_extent_end(path) - extent_start; + if (disk_bytenr == 0) { + em->disk_bytenr = EXTENT_MAP_HOLE; + em->disk_num_bytes = 0; + em->offset = 0; return; } + em->disk_bytenr = disk_bytenr; + em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + em->offset = btrfs_file_extent_offset(leaf, fi); if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - em->block_start = bytenr; - em->block_len = em->orig_block_len; + extent_map_set_compression(em, compress_type); } else { - bytenr += btrfs_file_extent_offset(leaf, fi); - em->block_start = bytenr; - em->block_len = em->len; + /* + * Older kernels can create regular non-hole data + * extents with ram_bytes smaller than disk_num_bytes. + * Not a big deal, just always use disk_num_bytes + * for ram_bytes. + */ + em->ram_bytes = em->disk_num_bytes; if (type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->flags |= EXTENT_FLAG_PREALLOC; } } else if (type == BTRFS_FILE_EXTENT_INLINE) { - em->block_start = EXTENT_MAP_INLINE; - em->start = extent_start; - em->len = extent_end - extent_start; - /* - * Initialize orig_start and block_len with the same values - * as in inode.c:btrfs_get_extent(). - */ - em->orig_start = EXTENT_MAP_HOLE; - em->block_len = (u64)-1; - em->compress_type = compress_type; - if (compress_type != BTRFS_COMPRESS_NONE) - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + /* Tree-checker has ensured this. 
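The net effect of this refactor: the old block_start was the precomputed sum disk_bytenr + offset, while the new layout keeps the two components separate and derives the physical address on demand. A hedged sketch of that derivation for the uncompressed, non-hole case (the struct is a stand-in mirroring the new extent_map fields, not the kernel type):

#include <stdint.h>

/* Subset of the new extent_map fields used for the translation. */
struct em_view {
	uint64_t start;		/* file offset the map begins at */
	uint64_t disk_bytenr;	/* start of the on-disk data extent */
	uint64_t offset;	/* offset into that data extent */
};

/*
 * Disk address backing a file position, for an uncompressed, non-hole
 * extent. The old code stored disk_bytenr + offset as block_start; the
 * new code keeps the components and computes the sum where needed.
 */
static uint64_t em_phys_for(const struct em_view *em, uint64_t file_pos)
{
	return em->disk_bytenr + em->offset + (file_pos - em->start);
}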
*/ + ASSERT(extent_start == 0); + + em->disk_bytenr = EXTENT_MAP_INLINE; + em->start = 0; + em->len = fs_info->sectorsize; + em->offset = 0; + extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " "root %llu", type, btrfs_ino(inode), extent_start, - root->root_key.objectid); + btrfs_root_id(root)); } } @@ -1343,12 +1350,10 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path) ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { - end = btrfs_file_extent_ram_bytes(leaf, fi); - end = ALIGN(key.offset + end, leaf->fs_info->sectorsize); - } else { + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) + end = leaf->fs_info->sectorsize; + else end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - } return end; } diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 04bd2d34efb1..0e13661a71f3 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,21 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/list.h> +#include <uapi/linux/btrfs_tree.h> #include "accessors.h" +struct extent_map; +struct btrfs_file_extent_item; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_bio; +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_ordered_sum; +struct btrfs_path; +struct btrfs_inode; + #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) @@ -55,14 +68,13 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); + struct list_head *list, bool nowait); int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, u64 start, u64 end, u8 *csum_buf, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fc6c91773bc8..0e63603ac5c7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -18,13 +18,12 @@ #include <linux/iversion.h> #include <linux/fsverity.h> #include "ctree.h" +#include "direct-io.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "volumes.h" #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" @@ -110,8 +109,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, * accessed as prepare_pages should have marked them accessed * in prepare_pages via find_or_create_page() */ - btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, - block_len); + btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), + block_start, block_len); unlock_page(pages[i]); put_page(pages[i]); } @@ -129,7 +128,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int err = 0; + int ret = 0; int i; u64 num_bytes; u64 start_pos; @@ -159,17 +158,20 @@ int btrfs_dirty_pages(struct btrfs_inode 
*inode, struct page **pages, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached); - err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); - if (err) - return err; + if (ret) + return ret; for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; - btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); - btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); - btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); + btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), + start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), + start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), + start_pos, num_bytes); } /* @@ -204,7 +206,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; - struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); @@ -241,10 +242,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); - if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; - update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -371,14 +372,17 @@ next_slot: btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, - disk_bytenr, num_bytes, 0); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - new_key.objectid, - args->start - extent_offset, - 0, false); + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = disk_bytenr, + .num_bytes = num_bytes, + .parent = 0, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, new_key.objectid, + args->start - extent_offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -461,14 +465,17 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - btrfs_init_generic_ref(&ref, - BTRFS_DROP_DELAYED_REF, - disk_bytenr, num_bytes, 0); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - key.objectid, - key.offset - extent_offset, 0, - false); + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = disk_bytenr, + .num_bytes = num_bytes, + .parent = 0, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, key.objectid, + key.offset - extent_offset, + 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -744,10 +751,13 @@ again: extent_end - split); btrfs_mark_buffer_dirty(trans, leaf); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, - num_bytes, 0); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, - orig_offset, 0, false); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = 0; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_root_id(root); + btrfs_init_data_ref(&ref, ino, 
orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -770,10 +780,14 @@ again: other_start = end; other_end = 0; - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, 0); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, - 0, false); + + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = 0; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_root_id(root); + btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -866,9 +880,9 @@ static int prepare_uptodate_page(struct inode *inode, * released. * * The private flag check is essential for subpage as we need - * to store extra bitmap using page->private. + * to store extra bitmap using folio private. */ - if (page->mapping != inode->i_mapping || !PagePrivate(page)) { + if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { unlock_page(page); return -EAGAIN; } @@ -911,7 +925,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); fgf_t fgp_flags = get_prepare_fgp_flags(nowait); - int err = 0; + int ret = 0; int faili; for (i = 0; i < num_pages; i++) { @@ -921,28 +935,28 @@ again: if (!pages[i]) { faili = i - 1; if (nowait) - err = -EAGAIN; + ret = -EAGAIN; else - err = -ENOMEM; + ret = -ENOMEM; goto fail; } - err = set_page_extent_mapped(pages[i]); - if (err < 0) { + ret = set_page_extent_mapped(pages[i]); + if (ret < 0) { faili = i; goto fail; } if (i == 0) - err = prepare_uptodate_page(inode, pages[i], pos, + ret = prepare_uptodate_page(inode, pages[i], pos, force_uptodate); - if (!err && i == num_pages - 1) - err = prepare_uptodate_page(inode, pages[i], + if (!ret && i == num_pages - 1) + ret = prepare_uptodate_page(inode, pages[i], pos + write_bytes, false); - if (err) { + if (ret) { put_page(pages[i]); - if (!nowait && err == -EAGAIN) { - err = 0; + if (!nowait && ret == -EAGAIN) { + ret = 0; goto again; } faili = i - 1; @@ -958,7 +972,7 @@ fail: put_page(pages[faili]); faili--; } - return err; + return ret; } @@ -1090,7 +1104,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL, nowait, false); + NULL, nowait, false); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else @@ -1108,33 +1122,32 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) static void update_time_for_write(struct inode *inode) { - struct timespec64 now, ctime; + struct timespec64 now, ts; if (IS_NOCMTIME(inode)) return; now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) - inode->i_mtime = now; + ts = inode_get_mtime(inode); + if (!timespec64_equal(&ts, &now)) + inode_set_mtime_to_ts(inode, now); - ctime = inode_get_ctime(inode); - if (!timespec64_equal(&ctime, &now)) + ts = inode_get_ctime(inode); + if (!timespec64_equal(&ts, &now)) inode_set_ctime_to_ts(inode, now); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); } -static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, - size_t count) +int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = 
btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1158,9 +1171,8 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, */ update_time_for_write(inode); - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); @@ -1172,13 +1184,12 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, return 0; } -static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, - struct iov_iter *i) +ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; loff_t pos; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; @@ -1189,7 +1200,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, ssize_t ret; bool only_release_metadata = false; bool force_page_uptodate = false; - loff_t old_isize = i_size_read(inode); + loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); @@ -1201,6 +1212,13 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (ret < 0) return ret; + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); + ret = generic_write_checks(iocb, i); if (ret <= 0) goto out; @@ -1436,202 +1454,6 @@ out: return num_written ? num_written : ret; } -static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - const u32 blocksize_mask = fs_info->sectorsize - 1; - - if (offset & blocksize_mask) - return -EINVAL; - - if (iov_iter_alignment(iter) & blocksize_mask) - return -EINVAL; - - return 0; -} - -static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - loff_t pos; - ssize_t written = 0; - ssize_t written_buffered; - size_t prev_left = 0; - loff_t endbyte; - ssize_t err; - unsigned int ilock_flags = 0; - struct iomap_dio *dio; - - if (iocb->ki_flags & IOCB_NOWAIT) - ilock_flags |= BTRFS_ILOCK_TRY; - - /* - * If the write DIO is within EOF, use a shared lock and also only if - * security bits will likely not be dropped by file_remove_privs() called - * from btrfs_write_check(). Either will need to be rechecked after the - * lock was acquired. - */ - if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) - ilock_flags |= BTRFS_ILOCK_SHARED; - -relock: - err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (err < 0) - return err; - - /* Shared lock cannot be used with security bits set. 
*/ - if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - ilock_flags &= ~BTRFS_ILOCK_SHARED; - goto relock; - } - - err = generic_write_checks(iocb, from); - if (err <= 0) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - return err; - } - - err = btrfs_write_check(iocb, from, err); - if (err < 0) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - goto out; - } - - pos = iocb->ki_pos; - /* - * Re-check since file size may have changed just before taking the - * lock or pos may have changed because of O_APPEND in generic_write_check() - */ - if ((ilock_flags & BTRFS_ILOCK_SHARED) && - pos + iov_iter_count(from) > i_size_read(inode)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - ilock_flags &= ~BTRFS_ILOCK_SHARED; - goto relock; - } - - if (check_direct_IO(fs_info, from, pos)) { - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - goto buffered; - } - - /* - * The iov_iter can be mapped to the same file range we are writing to. - * If that's the case, then we will deadlock in the iomap code, because - * it first calls our callback btrfs_dio_iomap_begin(), which will create - * an ordered extent, and after that it will fault in the pages that the - * iov_iter refers to. During the fault in we end up in the readahead - * pages code (starting at btrfs_readahead()), which will lock the range, - * find that ordered extent and then wait for it to complete (at - * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since - * obviously the ordered extent can never complete as we didn't submit - * yet the respective bio(s). This always happens when the buffer is - * memory mapped to the same file range, since the iomap DIO code always - * invalidates pages in the target file range (after starting and waiting - * for any writeback). - * - * So here we disable page faults in the iov_iter and then retry if we - * got -EFAULT, faulting in the pages before the retry. - */ -again: - from->nofault = true; - dio = btrfs_dio_write(iocb, from, written); - from->nofault = false; - - if (IS_ERR_OR_NULL(dio)) { - err = PTR_ERR_OR_ZERO(dio); - } else { - /* - * If we have a synchoronous write, we must make sure the fsync - * triggered by the iomap_dio_complete() call below doesn't - * deadlock on the inode lock - we are already holding it and we - * can't call it after unlocking because we may need to complete - * partial writes due to the input buffer (or parts of it) not - * being already faulted in. - */ - ASSERT(current->journal_info == NULL); - current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; - err = iomap_dio_complete(dio); - current->journal_info = NULL; - } - - /* No increment (+=) because iomap returns a cumulative value. */ - if (err > 0) - written = err; - - if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { - const size_t left = iov_iter_count(from); - /* - * We have more data left to write. Try to fault in as many as - * possible of the remainder pages and retry. We do this without - * releasing and locking again the inode, to prevent races with - * truncate. - * - * Also, in case the iov refers to pages in the file range of the - * file we want to write to (due to a mmap), we could enter an - * infinite loop if we retry after faulting the pages in, since - * iomap will invalidate any pages in the range early on, before - * it tries to fault in the pages of the iov. 
So we keep track of - * how much was left of iov in the previous EFAULT and fallback - * to buffered IO in case we haven't made any progress. - */ - if (left == prev_left) { - err = -ENOTBLK; - } else { - fault_in_iov_iter_readable(from, left); - prev_left = left; - goto again; - } - } - - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - - /* - * If 'err' is -ENOTBLK or we have not written all data, then it means - * we must fallback to buffered IO. - */ - if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) - goto out; - -buffered: - /* - * If we are in a NOWAIT context, then return -EAGAIN to signal the caller - * it must retry the operation in a context where blocking is acceptable, - * because even if we end up not blocking during the buffered IO attempt - * below, we will block when flushing and waiting for the IO. - */ - if (iocb->ki_flags & IOCB_NOWAIT) { - err = -EAGAIN; - goto out; - } - - pos = iocb->ki_pos; - written_buffered = btrfs_buffered_write(iocb, from); - if (written_buffered < 0) { - err = written_buffered; - goto out; - } - /* - * Ensure all data is persisted. We want the next direct IO read to be - * able to read what was just written. - */ - endbyte = pos + written_buffered - 1; - err = btrfs_fdatawrite_range(inode, pos, endbyte); - if (err) - goto out; - err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); - if (err) - goto out; - written += written_buffered; - iocb->ki_pos = pos + written_buffered; - invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); -out: - return err < 0 ? err : written; -} - static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded) { @@ -1731,7 +1553,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) return 0; } -static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end) { int ret; struct blk_plug plug; @@ -1751,10 +1573,10 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) { - struct btrfs_inode *inode = BTRFS_I(ctx->inode); + struct btrfs_inode *inode = ctx->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (btrfs_inode_in_log(inode, fs_info->generation) && + if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) && list_empty(&ctx->ordered_extents)) return true; @@ -1765,7 +1587,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) * and for a fast fsync we don't wait for that, we only wait for the * writeback to complete. 
*/ - if (inode->last_trans <= fs_info->last_trans_committed && + if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) && (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || list_empty(&ctx->ordered_extents))) return true; @@ -1787,9 +1609,9 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; @@ -1800,7 +1622,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { skip_ilock = true; current->journal_info = NULL; - lockdep_assert_held(&inode->i_rwsem); + btrfs_assert_inode_locked(inode); } trace_btrfs_sync_file(file, datasync); @@ -1830,9 +1652,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; if (skip_ilock) - down_write(&BTRFS_I(inode)->i_mmap_lock); + down_write(&inode->i_mmap_lock); else - btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -1857,9 +1679,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = start_ordered_ops(inode, start, end); if (ret) { if (skip_ilock) - up_write(&BTRFS_I(inode)->i_mmap_lock); + up_write(&inode->i_mmap_lock); else - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -1871,8 +1693,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * running delalloc the full sync flag may be set if we need to drop * extra extent map ranges due to temporary memory allocation failures. */ - full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We have to do this here to avoid the priority inversion of waiting on @@ -1891,15 +1712,29 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); + clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); } else { /* * Get our ordered extents as soon as possible to avoid doing * checksum lookups in the csum tree, and use instead the * checksums attached to the ordered extents. */ - btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), - &ctx.ordered_extents); - ret = filemap_fdatawait_range(inode->i_mapping, start, end); + btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end); + if (ret) + goto out_release_extents; + + /* + * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after + * starting and waiting for writeback, because for buffered IO + * it may have been set during the end IO callback + * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in + * case an error happened and we need to wait for ordered + * extents to complete so that any extent maps that point to + * unwritten locations are dropped and we don't log them. 
+ */ + if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags)) + ret = btrfs_wait_ordered_range(inode, start, len); } if (ret) @@ -1907,15 +1742,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - smp_mb(); if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ - clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * An ordered extent might have started before and completed * already with io errors, in which case the inode was not @@ -1923,10 +1756,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * for any errors that might have happened since we last * checked called fsync. */ - ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); + ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err); goto out_release_extents; } + btrfs_init_log_ctx_scratch_eb(&ctx); + /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for @@ -1946,6 +1781,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + /* + * Scratch eb no longer needed, release before syncing log or commit + * transaction, to avoid holding unnecessary memory during such long + * operations. + */ + if (ctx.scratch_eb) { + free_extent_buffer(ctx.scratch_eb); + ctx.scratch_eb = NULL; + } btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -1963,9 +1807,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * inside btrfs_sync_log to keep things safe. */ if (skip_ilock) - up_write(&BTRFS_I(inode)->i_mmap_lock); + up_write(&inode->i_mmap_lock); else - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); @@ -2024,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_commit_transaction(trans); out: + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); @@ -2034,12 +1879,179 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); if (skip_ilock) - up_write(&BTRFS_I(inode)->i_mmap_lock); + up_write(&inode->i_mmap_lock); else - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. 
Because + * truncate_setsize() writes the inode size before removing pages, once we have + * the page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct folio *folio = page_folio(page); + struct inode *inode = file_inode(vmf->vma->vm_file); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; + unsigned long zero_start; + loff_t size; + size_t fsize = folio_size(folio); + vm_fault_t ret; + int ret2; + int reserved = 0; + u64 reserved_space; + u64 page_start; + u64 page_end; + u64 end; + + ASSERT(folio_order(folio) == 0); + + reserved_space = fsize; + + sb_start_pagefault(inode->i_sb); + page_start = folio_pos(folio); + page_end = page_start + folio_size(folio) - 1; + end = page_end; + + /* + * Reserving delalloc space after obtaining the page lock can lead to + * deadlock. For example, if a dirty page is locked by this function + * and the call to btrfs_delalloc_reserve_space() ends up triggering + * dirty page write out, then the btrfs_writepages() function could + * end up waiting indefinitely to get a lock on the page currently + * being processed by the btrfs_page_mkwrite() function. + */ + ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); + if (!ret2) { + ret2 = file_update_time(vmf->vma->vm_file); + reserved = 1; + } + if (ret2) { + ret = vmf_error(ret2); + if (reserved) + goto out; + goto out_noreserve; + } + + /* Make the VM retry the fault. */ + ret = VM_FAULT_NOPAGE; +again: + down_read(&BTRFS_I(inode)->i_mmap_lock); + folio_lock(folio); + size = i_size_read(inode); + + if ((folio->mapping != inode->i_mapping) || + (page_start >= size)) { + /* Page got truncated out from underneath us. */ + goto out_unlock; + } + folio_wait_writeback(folio); + + lock_extent(io_tree, page_start, page_end, &cached_state); + ret2 = set_folio_extent_mapped(folio); + if (ret2 < 0) { + ret = vmf_error(ret2); + unlock_extent(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } + + /* + * We can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish. + */ + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, &cached_state); + folio_unlock(folio); + up_read(&BTRFS_I(inode)->i_mmap_lock); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } + + if (folio->index == ((size - 1) >> PAGE_SHIFT)) { + reserved_space = round_up(size - page_start, fs_info->sectorsize); + if (reserved_space < fsize) { + end = page_start + reserved_space - 1; + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, end + 1, + fsize - reserved_space, true); + } + } + + /* + * page_mkwrite gets called when the page is first dirtied after it's + * faulted in, but write(2) could also dirty a page and set delalloc + * bits, thus in this case for space accounting reasons, we still need to + * clear any delalloc bits within this page range since we have to + * reserve data & metadata space before lock_page() (see above comments).
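The comment above explains why the space reservation must happen before the folio lock. A compilable toy model of that ordering, with hypothetical stub functions standing in for the real reservation and locking primitives; not btrfs code, just the ordering discipline:

#include <stdbool.h>
#include <stdio.h>

/* Stubs standing in for the real reservation and locking primitives. */
static bool reserve_delalloc_space(void) { puts("reserve space (may trigger writeback)"); return true; }
static void lock_folio(void)            { puts("lock folio"); }
static void unlock_folio(void)          { puts("unlock folio"); }
static void release_reservation(void)   { puts("release reservation"); }

/*
 * Ordering used by the fault handler: reserve first, lock second. If the
 * reservation ran while the folio lock was held, a reservation that kicks
 * off dirty-page writeback could block forever on the very folio this
 * handler has locked.
 */
static int fault_path(bool truncated_meanwhile)
{
	if (!reserve_delalloc_space())
		return -1;
	lock_folio();
	if (truncated_meanwhile) {
		/* Folio fell beyond EOF after we locked it: back out. */
		unlock_folio();
		release_reservation();
		return -1;
	}
	puts("mark folio dirty and return");
	unlock_folio();
	return 0;
}

int main(void)
{
	fault_path(false);
	fault_path(true);
	return 0;
}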
+ */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); + + ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + &cached_state); + if (ret2) { + unlock_extent(io_tree, page_start, page_end, &cached_state); + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } + + /* Page is wholly or partially inside EOF. */ + if (page_start + folio_size(folio) > size) + zero_start = offset_in_folio(folio, size); + else + zero_start = fsize; + + if (zero_start != fsize) + folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); + + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); + btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); + btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); + + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + + unlock_extent(io_tree, page_start, page_end, &cached_state); + up_read(&BTRFS_I(inode)->i_mmap_lock); + + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; + +out_unlock: + folio_unlock(folio); + up_read(&BTRFS_I(inode)->i_mmap_lock); +out: + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, + reserved_space, (ret != 0)); +out_noreserve: + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return ret; +} + static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, @@ -2169,12 +2181,9 @@ out: hole_em->start = offset; hole_em->len = end - offset; hole_em->ram_bytes = hole_em->len; - hole_em->orig_start = offset; - hole_em->block_start = EXTENT_MAP_HOLE; - hole_em->block_len = 0; - hole_em->orig_block_len = 0; - hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->disk_bytenr = EXTENT_MAP_HOLE; + hole_em->disk_num_bytes = 0; hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); @@ -2198,14 +2207,14 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, + em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ - if (em->block_start == EXTENT_MAP_HOLE) { + if (em->disk_bytenr == EXTENT_MAP_HOLE) { ret = 1; *len = em->start + em->len > *start + *len ? 0 : *start + *len - em->start - em->len; @@ -2227,15 +2236,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0. */ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE); while (1) { truncate_pagecache_range(inode, lockstart, lockend); lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); + /* The same page or adjacent pages. 
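The page_lockstart/page_lockend rounding in btrfs_punch_hole_lock_range() above can legitimately produce an empty page range, which is why the patched code defers the "- 1" until after the comparison. A small standalone C check of that arithmetic, with PAGE_SIZE fixed at 4096 for the demo:

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE 4096ULL

/* Same rounding helpers the kernel uses, for power-of-two alignment. */
static uint64_t round_up_pow2(uint64_t x, uint64_t a)   { return (x + a - 1) & ~(a - 1); }
static uint64_t round_down_pow2(uint64_t x, uint64_t a) { return x & ~(a - 1); }

int main(void)
{
	/*
	 * A sub-page hole punch, e.g. lockstart=1024 lockend=2047, yields
	 * page_lockstart=4096 and page_lockend=0: no whole page is covered.
	 * That is the page_lockend <= page_lockstart case the code breaks
	 * out on. The pre-patch code computed round_down(lockend + 1,
	 * PAGE_SIZE) - 1, which wraps to UINT64_MAX when the round_down
	 * result is 0, hence the comment about not decreasing it early.
	 */
	uint64_t lockstart = 1024, lockend = 2047;
	uint64_t page_lockstart = round_up_pow2(lockstart, DEMO_PAGE_SIZE);
	uint64_t page_lockend = round_down_pow2(lockend + 1, DEMO_PAGE_SIZE);

	printf("page_lockstart=%llu page_lockend=%llu covered=%s\n",
	       (unsigned long long)page_lockstart,
	       (unsigned long long)page_lockend,
	       page_lockend <= page_lockstart ? "none" : "some");
	return 0;
}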
*/ + if (page_lockend <= page_lockstart) + break; /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2247,7 +2261,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * we do, unlock the range and retry. */ if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + page_lockend - 1)) break; unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, @@ -2270,7 +2284,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; int slot; - struct btrfs_ref ref = { 0 }; int ret; if (replace_len == 0) @@ -2326,14 +2339,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->qgroup_reserved, &key); } else { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = extent_info->disk_offset, + .num_bytes = extent_info->disk_len, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; u64 ref_offset; - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - extent_info->disk_offset, - extent_info->disk_len, 0); ref_offset = extent_info->file_offset - extent_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(inode), ref_offset, 0, false); + btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2500,9 +2516,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, inode_inc_iversion(&inode->vfs_inode); if (!extent_info || extent_info->update_times) - inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode); + inode_set_mtime_to_ts(&inode->vfs_inode, + inode_set_ctime_current(&inode->vfs_inode)); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) break; @@ -2613,7 +2630,7 @@ out: static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; @@ -2631,7 +2648,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); - ret = btrfs_wait_ordered_range(inode, offset, len); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len); if (ret) goto out_only_mutex; @@ -2741,8 +2758,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -2761,14 +2778,14 @@ out_only_mutex: struct timespec64 now = inode_set_ctime_current(inode); inode_inc_iversion(inode); - inode->i_mtime = now; + inode_set_mtime_to_ts(inode, now); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); } else { int ret2; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; @@ -2835,7 +2852,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, 
inode_set_ctime_current(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); return ret ? ret : ret2; @@ -2855,13 +2872,13 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); - if (em->block_start == EXTENT_MAP_HOLE) + if (em->disk_bytenr == EXTENT_MAP_HOLE) ret = RANGE_BOUNDARY_HOLE; - else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + else if (em->flags & EXTENT_FLAG_PREALLOC) ret = RANGE_BOUNDARY_PREALLOC_EXTENT; else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; @@ -2886,7 +2903,7 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -2901,8 +2918,7 @@ static int btrfs_zero_range(struct inode *inode, * extents and holes, we drop all the existing extents and allocate a * new prealloc extent, so that we get a larger contiguous disk extent. */ - if (em->start <= alloc_start && - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { const u64 em_end = em->start + em->len; if (em_end >= offset + len) { @@ -2924,26 +2940,25 @@ static int btrfs_zero_range(struct inode *inode, ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; - alloc_hint = em->block_start + em->len; + alloc_hint = extent_map_block_start(em) + em->len; } free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, - sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (em->flags & EXTENT_FLAG_PREALLOC) { free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } - if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { + if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { free_extent_map(em); ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, 0); @@ -3026,7 +3041,7 @@ reserve_space: } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, - i_blocksize(inode), + fs_info->sectorsize, offset + len, &alloc_hint); unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); @@ -3070,7 +3085,7 @@ static long btrfs_fallocate(struct file *file, int mode, int ret; /* Do not allow fallocate in ZONED mode */ - if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) + if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; alloc_start = round_down(offset, blocksize); @@ -3128,7 +3143,7 @@ static long btrfs_fallocate(struct file *file, int mode, * the file range and, due to the previous locking we did, we know there * can't be more delalloc or ordered extents in the range. 
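A sketch of the three-way boundary classification used by btrfs_zero_range_check_range_boundary() above: hole and prealloc boundaries need no manual zeroing, while a written extent boundary must have its partial sector zeroed. The struct and field names here are illustrative stand-ins, not btrfs types:

#include <stdio.h>
#include <stdbool.h>

enum range_boundary { BOUNDARY_HOLE, BOUNDARY_PREALLOC, BOUNDARY_WRITTEN };

struct demo_extent {
	bool is_hole;	/* analogue of disk_bytenr == EXTENT_MAP_HOLE */
	bool prealloc;	/* analogue of the EXTENT_FLAG_PREALLOC flag bit */
};

/* Mirrors the if/else ladder in the kernel function, nothing more. */
static enum range_boundary classify(const struct demo_extent *em)
{
	if (em->is_hole)
		return BOUNDARY_HOLE;
	if (em->prealloc)
		return BOUNDARY_PREALLOC;
	return BOUNDARY_WRITTEN;
}

int main(void)
{
	struct demo_extent hole = { .is_hole = true };
	struct demo_extent written = { 0 };

	printf("hole boundary -> %d, written boundary -> %d\n",
	       classify(&hole), classify(&written));
	return 0;
}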
*/ - ret = btrfs_wait_ordered_range(inode, alloc_start, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start, alloc_end - alloc_start); if (ret) goto out; @@ -3147,7 +3162,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -3156,9 +3171,9 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); - if (em->block_start == EXTENT_MAP_HOLE || + if (em->disk_bytenr == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + !(em->flags & EXTENT_FLAG_PREALLOC))) { const u64 range_len = last_byte - cur_offset; ret = add_falloc_range(&reserve_list, cur_offset, range_len); @@ -3198,7 +3213,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, i_blocksize(inode), + range->len, blocksize, offset + len, &alloc_hint); /* * btrfs_prealloc_file_range() releases space even @@ -3759,8 +3774,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | - FMODE_CAN_ODIRECT; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) @@ -3768,97 +3782,6 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } -static int check_direct_read(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - int ret; - int i, seg; - - ret = check_direct_IO(fs_info, iter, offset); - if (ret < 0) - return ret; - - if (!iter_is_iovec(iter)) - return 0; - - for (seg = 0; seg < iter->nr_segs; seg++) { - for (i = seg + 1; i < iter->nr_segs; i++) { - const struct iovec *iov1 = iter_iov(iter) + seg; - const struct iovec *iov2 = iter_iov(iter) + i; - - if (iov1->iov_base == iov2->iov_base) - return -EINVAL; - } - } - return 0; -} - -static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) -{ - struct inode *inode = file_inode(iocb->ki_filp); - size_t prev_left = 0; - ssize_t read = 0; - ssize_t ret; - - if (fsverity_active(inode)) - return 0; - - if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) - return 0; - - btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); -again: - /* - * This is similar to what we do for direct IO writes, see the comment - * at btrfs_direct_write(), but we also disable page faults in addition - * to disabling them only at the iov_iter level. This is because when - * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), - * which can still trigger page fault ins despite having set ->nofault - * to true of our 'to' iov_iter. - * - * The difference to direct IO writes is that we deadlock when trying - * to lock the extent range in the inode's tree during he page reads - * triggered by the fault in (while for writes it is due to waiting for - * our own ordered extent). This is because for direct IO reads, - * btrfs_dio_iomap_begin() returns with the extent range locked, which - * is only unlocked in the endio callback (end_bio_extent_readpage()). 
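The direct read path being removed here (its retry loop continues just below) only retries while each attempt makes forward progress, and falls back to a buffered read otherwise, so a buffer that can never be faulted in cannot loop forever. A userspace model of that retry policy, with do_read() as a made-up stand-in reporting how many bytes remain after an attempt:

#include <stdio.h>
#include <stddef.h>

static size_t do_read(size_t remaining, size_t progress_per_try)
{
	return remaining > progress_per_try ? remaining - progress_per_try : 0;
}

static const char *read_with_retries(size_t total, size_t progress_per_try)
{
	size_t prev_left = (size_t)-1;
	size_t left = total;

	while (left > 0) {
		left = do_read(left, progress_per_try);
		if (left == prev_left)
			return "no progress: fall back to buffered";
		prev_left = left;	/* progress made: fault in and retry */
	}
	return "fully read";
}

int main(void)
{
	printf("%s\n", read_with_retries(8192, 4096));
	printf("%s\n", read_with_retries(8192, 0));
	return 0;
}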
- */ - pagefault_disable(); - to->nofault = true; - ret = btrfs_dio_read(iocb, to, read); - to->nofault = false; - pagefault_enable(); - - /* No increment (+=) because iomap returns a cumulative value. */ - if (ret > 0) - read = ret; - - if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { - const size_t left = iov_iter_count(to); - - if (left == prev_left) { - /* - * We didn't make any progress since the last attempt, - * fallback to a buffered read for the remainder of the - * range. This is just to avoid any possibility of looping - * for too long. - */ - ret = read; - } else { - /* - * We made some progress since the last retry or this is - * the first time we are retrying. Fault in as many pages - * as possible and retry. - */ - fault_in_iov_iter_writeable(to, left); - prev_left = left; - goto again; - } - } - btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); - return ret < 0 ? ret : read; -} - static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; @@ -3890,10 +3813,12 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end) { + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; /* @@ -3910,10 +3835,9 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) * know better and pull this out at some point in the future, it is * right and you are wrong. */ - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + ret = filemap_fdatawrite_range(mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags)) + ret = filemap_fdatawrite_range(mapping, start, end); return ret; } diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 82b34fbb295f..912254e653cf 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -3,6 +3,21 @@ #ifndef BTRFS_FILE_H #define BTRFS_FILE_H +#include <linux/types.h> + +struct file; +struct extent_state; +struct kiocb; +struct iov_iter; +struct page; +struct btrfs_ioctl_encoded_io_args; +struct btrfs_drop_extents_args; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_path; +struct btrfs_replace_extent_info; +struct btrfs_trans_handle; + extern const struct file_operations btrfs_file_operations; int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); @@ -22,12 +37,14 @@ int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve); -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); +int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count); +ssize_t 
btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9a6ec9344c3e..f4bcb2530660 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -19,9 +19,7 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "volumes.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" #include "discard.h" #include "subpage.h" @@ -57,6 +55,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stats); +static void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; @@ -79,7 +82,6 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_key location; struct btrfs_disk_key disk_key; @@ -113,7 +115,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); + inode = btrfs_iget_path(location.objectid, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) @@ -135,7 +137,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, spin_lock(&block_group->lock); if (block_group->inode) - inode = igrab(block_group->inode); + inode = igrab(&block_group->inode->vfs_inode); spin_unlock(&block_group->lock); if (inode) return inode; @@ -154,7 +156,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, } if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) - block_group->inode = igrab(inode); + block_group->inode = BTRFS_I(igrab(inode)); spin_unlock(&block_group->lock); return inode; @@ -354,7 +356,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, if (ret) goto fail; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); fail: if (locked) @@ -394,7 +396,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return -ENOMEM; io_ctl->num_pages = num_pages; - io_ctl->fs_info = btrfs_sb(inode->i_sb); + io_ctl->fs_info = inode_to_fs_info(inode); io_ctl->inode = inode; return 0; @@ -434,8 +436,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { - btrfs_page_clear_checked(io_ctl->fs_info, - io_ctl->pages[i], + btrfs_folio_clear_checked(io_ctl->fs_info, + page_folio(io_ctl->pages[i]), page_offset(io_ctl->pages[i]), PAGE_SIZE); unlock_page(io_ctl->pages[i]); @@ -540,7 +542,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); @@ -562,7 +564,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, 
PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, @@ -1266,7 +1268,7 @@ static int flush_dirty_cache(struct inode *inode) { int ret; - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); @@ -1322,7 +1324,7 @@ out: "failed to write free space cache for block group %llu error %d", block_group->start, ret); } - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode(trans, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ @@ -1363,7 +1365,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, /* * Write out cached info to an inode. * - * @root: root the inode belongs to * @inode: freespace inode we are writing out * @ctl: free space cache we are going to write out * @block_group: block_group for this cache if it belongs to a block_group @@ -1374,7 +1375,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, +static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, @@ -1482,7 +1483,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl->entries = entries; io_ctl->bitmaps = bitmaps; - ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); + ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1); if (ret) goto out; @@ -1507,7 +1508,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode(trans, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; @@ -1533,8 +1534,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, - block_group, &block_group->io_ctl, trans); + ret = __btrfs_write_out_cache(inode, ctl, block_group, + &block_group->io_ctl, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for block group %llu error %d", @@ -2618,7 +2619,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_block_group *block_group, +static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { @@ -4157,15 +4158,13 @@ out: int __init btrfs_free_space_init(void) { - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); if (!btrfs_free_space_cachep) return -ENOMEM; btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); + 0, NULL); if (!btrfs_free_space_bitmap_cachep) { kmem_cache_destroy(btrfs_free_space_cachep); return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index bd80c7b2af96..9f1dbfdee8ca 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -6,7 +6,19 @@ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H +#include <linux/rbtree.h> +#include <linux/list.h> +#include 
<linux/spinlock.h> +#include <linux/mutex.h> #include <linux/freezer.h> +#include "fs.h" + +struct inode; +struct page; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_trans_handle; +struct btrfs_trim_block_group; /* * This is the trim state of an extent or bitmap. @@ -121,8 +133,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, - u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a0d8160b5375..308abbf8855b 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1104,11 +1104,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; - ASSERT(ret == 0); + /* + * If ret is 1 (no key found), it means this is an empty block group, + * without any extents allocated from it and there's no block group + * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree + * because we are using the block group tree feature, so block group + * items are stored in the block group tree. It also means there are no + * extents allocated for block groups with a start offset beyond this + * block group's end offset (this is the last, highest, block group). + */ + if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) + ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; - while (1) { + while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -1138,8 +1148,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; - if (ret) - break; } if (start < end) { ret = __add_to_free_space_tree(trans, block_group, path2, @@ -1176,12 +1184,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } ret = btrfs_global_root_insert(free_space_root); if (ret) { btrfs_put_root(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } node = rb_first_cached(&fs_info->block_group_cache_tree); @@ -1189,8 +1201,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; + } node = rb_next(node); } @@ -1206,11 +1221,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: +out_clear: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); return ret; } @@ -1273,12 +1286,18 @@ int btrfs_delete_free_space_tree(struct 
btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } btrfs_global_root_delete(free_space_root); @@ -1299,11 +1318,6 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) } return btrfs_commit_transaction(trans); - -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1326,8 +1340,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { @@ -1336,8 +1353,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_next(node); } @@ -1348,10 +1368,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 6d5551d0ced8..e6c6d6f4f221 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,7 +6,13 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +#include <linux/bits.h> + struct btrfs_caching_control; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_block_group; +struct btrfs_trans_handle; /* * The default size for new free space bitmap items. 
The last bitmap in a block diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index d24d41f7811a..374843aca60d 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -4,14 +4,49 @@ #define BTRFS_FS_H #include <linux/blkdev.h> -#include <linux/fs.h> -#include <linux/btrfs_tree.h> #include <linux/sizes.h> +#include <linux/time64.h> +#include <linux/compiler.h> +#include <linux/math.h> +#include <linux/atomic.h> +#include <linux/percpu_counter.h> +#include <linux/completion.h> +#include <linux/lockdep.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwlock_types.h> +#include <linux/rwsem.h> +#include <linux/semaphore.h> +#include <linux/list.h> +#include <linux/radix-tree.h> +#include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/wait_bit.h> +#include <linux/sched.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "extent-io-tree.h" -#include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" +struct inode; +struct super_block; +struct kobject; +struct reloc_control; +struct crypto_shash; +struct ulist; +struct btrfs_device; +struct btrfs_block_group; +struct btrfs_root; +struct btrfs_fs_devices; +struct btrfs_transaction; +struct btrfs_delayed_root; +struct btrfs_balance_control; +struct btrfs_subpage_info; +struct btrfs_stripe_hash_table; +struct btrfs_space_info; + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -63,7 +98,9 @@ enum { /* The btrfs_fs_info created for self-tests */ BTRFS_FS_STATE_DUMMY_FS_INFO, - BTRFS_FS_STATE_NO_CSUMS, + /* Checksum errors are ignored. */ + BTRFS_FS_STATE_NO_DATA_CSUMS, + BTRFS_FS_STATE_SKIP_META_CSUMS, /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, @@ -139,6 +176,12 @@ enum { */ BTRFS_FS_FEATURE_CHANGED, + /* + * Indicate that we have found a tree block which is only aligned to + * sectorsize, but not to nodesize. This should be rare nowadays. 
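The new BTRFS_FS_UNALIGNED_TREE_BLOCK flag above records tree blocks that are aligned only to sectorsize, not to nodesize. A standalone alignment check in the shape of the kernel's IS_ALIGNED(), with example geometry picked for the demo:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Power-of-two alignment test, same shape as the kernel's IS_ALIGNED(). */
static bool is_aligned(uint64_t x, uint64_t a)
{
	return (x & (a - 1)) == 0;
}

int main(void)
{
	/*
	 * A tree block with a 16384-byte nodesize should sit on a nodesize
	 * boundary; one aligned only to a 4096-byte sectorsize is exactly
	 * the case the new flag records.
	 */
	uint64_t nodesize = 16384, sectorsize = 4096;
	uint64_t bytenr = 20480;	/* 5 * 4096: sector-aligned, not node-aligned */

	printf("sector aligned: %d, node aligned: %d\n",
	       is_aligned(bytenr, sectorsize), is_aligned(bytenr, nodesize));
	return 0;
}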
+ */ + BTRFS_FS_UNALIGNED_TREE_BLOCK, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -152,38 +195,39 @@ enum { * Note: don't forget to add new options to btrfs_show_options() */ enum { - BTRFS_MOUNT_NODATASUM = (1UL << 0), - BTRFS_MOUNT_NODATACOW = (1UL << 1), - BTRFS_MOUNT_NOBARRIER = (1UL << 2), - BTRFS_MOUNT_SSD = (1UL << 3), - BTRFS_MOUNT_DEGRADED = (1UL << 4), - BTRFS_MOUNT_COMPRESS = (1UL << 5), - BTRFS_MOUNT_NOTREELOG = (1UL << 6), - BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), - BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), - BTRFS_MOUNT_NOSSD = (1UL << 9), - BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), - BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), - BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), - BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), - BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), - BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), - BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), - BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), - BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), - BTRFS_MOUNT_NODISCARD = (1UL << 31), + BTRFS_MOUNT_NODATASUM = (1ULL << 0), + BTRFS_MOUNT_NODATACOW = (1ULL << 1), + BTRFS_MOUNT_NOBARRIER = (1ULL << 2), + BTRFS_MOUNT_SSD = (1ULL << 3), + BTRFS_MOUNT_DEGRADED = (1ULL << 4), + BTRFS_MOUNT_COMPRESS = (1ULL << 5), + BTRFS_MOUNT_NOTREELOG = (1ULL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1ULL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1ULL << 8), + BTRFS_MOUNT_NOSSD = (1ULL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1ULL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1ULL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1ULL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1ULL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1ULL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1ULL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1ULL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1ULL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1ULL << 18), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1ULL << 19), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1ULL << 20), + BTRFS_MOUNT_FRAGMENT_DATA = (1ULL << 21), + BTRFS_MOUNT_FRAGMENT_METADATA = (1ULL << 22), + BTRFS_MOUNT_FREE_SPACE_TREE = (1ULL << 23), + BTRFS_MOUNT_NOLOGREPLAY = (1ULL << 24), + BTRFS_MOUNT_REF_VERIFY = (1ULL << 25), + BTRFS_MOUNT_DISCARD_ASYNC = (1ULL << 26), + BTRFS_MOUNT_IGNOREBADROOTS = (1ULL << 27), + BTRFS_MOUNT_IGNOREDATACSUMS = (1ULL << 28), + BTRFS_MOUNT_NODISCARD = (1ULL << 29), + BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30), + BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31), + BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32), }; /* @@ -216,7 +260,8 @@ enum { BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) #ifdef CONFIG_BTRFS_DEBUG /* @@ -225,6 +270,7 @@ enum { */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) #else @@ -239,6 +285,7 @@ enum { #define 
BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) struct btrfs_dev_replace { @@ -371,6 +418,7 @@ struct btrfs_fs_info { struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; + struct btrfs_root *stripe_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -393,7 +441,8 @@ struct btrfs_fs_info { struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; + struct rb_root_cached mapping_tree; + rwlock_t mapping_tree_lock; /* * Block reservation for extent, checksum, root tree and delayed dir @@ -411,7 +460,17 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; + /* + * Updated while holding the lock 'trans_lock'. Due to the life cycle of + * a transaction, it can be directly read while holding a transaction + * handle, everywhere else must be read with btrfs_get_fs_generation(). + * Should always be updated using btrfs_set_fs_generation(). + */ u64 generation; + /* + * Always use btrfs_get_last_trans_committed() and + * btrfs_set_last_trans_committed() to read and update this field. + */ u64 last_trans_committed; /* * Generation of the last transaction used for block group relocation @@ -425,7 +484,7 @@ struct btrfs_fs_info { * required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt; + unsigned long long mount_opt; unsigned long compress_type:4; unsigned int compress_level; @@ -576,6 +635,13 @@ struct btrfs_fs_info { s32 dirty_metadata_batch; s32 delalloc_batch; + struct percpu_counter evictable_extent_maps; + spinlock_t extent_map_shrinker_lock; + u64 extent_map_shrinker_last_root; + u64 extent_map_shrinker_last_ino; + atomic64_t extent_map_shrinker_nr_to_scan; + struct work_struct extent_map_shrinker_work; + /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; @@ -642,14 +708,11 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; + u32 sectors_per_page; struct workqueue_struct *scrub_workers; - struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif /* Is qgroup tracking in a consistent state? */ u64 qgroup_flags; @@ -685,6 +748,7 @@ struct btrfs_fs_info { /* Protected by qgroup_rescan_lock */ bool qgroup_rescan_running; u8 qgroup_drop_subtree_thres; + u64 qgroup_enable_gen; /* * If this is not 0, then it indicates a serious filesystem error has @@ -717,10 +781,13 @@ struct btrfs_fs_info { /* Reclaim partially filled block groups in the background */ struct work_struct reclaim_bgs_work; + /* Protected by unused_bgs_lock. */ struct list_head reclaim_bgs; int bg_reclaim_threshold; + /* Protects the lists unused_bgs and reclaim_bgs. */ spinlock_t unused_bgs_lock; + /* Protected by unused_bgs_lock. 
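The mount option masks above switch from 1UL to 1ULL shifts because the flag set has outgrown what a 32-bit unsigned long can hold, matching mount_opt becoming unsigned long long. A small demonstration of the width issue; the variable name is just a label for the 33rd bit (BTRFS_MOUNT_IGNORESUPERFLAGS in the enum above):

#include <stdio.h>

int main(void)
{
	/*
	 * On a 32-bit build "unsigned long" is 32 bits wide, so 1UL << 32
	 * is undefined and cannot represent a 33rd flag. 1ULL is always
	 * at least 64 bits, so the shift below is well defined everywhere.
	 */
	unsigned long long ignoresuperflags = 1ULL << 32;

	printf("bit 32 as 1ULL << 32: %#llx\n", ignoresuperflags);
	printf("sizeof(unsigned long long): %zu bytes\n", sizeof(unsigned long long));
	return 0;
}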
*/ struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ @@ -814,6 +881,37 @@ struct btrfs_fs_info { #endif }; +#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ + struct page *: (_page))->mapping->host)) +#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ + struct folio *: (_folio))->mapping->host)) + +#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) +#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) + +#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ + struct inode *: (_inode)))->root->fs_info) + +static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->generation); +} + +static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen) +{ + WRITE_ONCE(fs_info->generation, gen); +} + +static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_trans_committed); +} + +static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen) +{ + WRITE_ONCE(fs_info->last_trans_committed, gen); +} + static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, u64 gen) { @@ -868,7 +966,7 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) /* * Count how many fs_info->max_extent_size cover the @size */ -static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 size) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (!fs_info) @@ -887,6 +985,8 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation op); +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -927,21 +1027,7 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - -static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { @@ -960,7 +1046,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) * since setting and checking for SB_RDONLY in the superblock's flags is not * atomic. 
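btrfs_get_fs_generation() and the other helpers above wrap the counters in READ_ONCE()/WRITE_ONCE() so unlocked readers never see torn or compiler-duplicated reads of a 64-bit value. A userspace analogue using C11 relaxed atomics, which give the same single-access guarantee; the struct is a stand-in, not the kernel's fs_info:

#include <stdatomic.h>
#include <stdio.h>
#include <stdint.h>

struct demo_fs_info {
	_Atomic uint64_t generation;
};

/* Relaxed load: one full 64-bit access, never torn, never re-read. */
static uint64_t get_generation(struct demo_fs_info *fs)
{
	return atomic_load_explicit(&fs->generation, memory_order_relaxed);
}

/* Relaxed store: the matching single full-width write. */
static void set_generation(struct demo_fs_info *fs, uint64_t gen)
{
	atomic_store_explicit(&fs->generation, gen, memory_order_relaxed);
}

int main(void)
{
	struct demo_fs_info fs;

	set_generation(&fs, 42);
	printf("generation: %llu\n", (unsigned long long)get_generation(&fs));
	return 0;
}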
*/ -static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) +static inline int btrfs_need_cleaner_sleep(const struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || btrfs_fs_closing(fs_info); @@ -981,7 +1067,7 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) #define EXPORT_FOR_TESTS -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); } @@ -992,7 +1078,7 @@ void btrfs_test_destroy_inode(struct inode *inode); #define EXPORT_FOR_TESTS static -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) { return 0; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index d3ff97374d48..29572dfaf878 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -9,13 +9,12 @@ #include "inode-item.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "space-info.h" #include "accessors.h" #include "extent-tree.h" #include "file-item.h" -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name) { @@ -43,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, } struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; @@ -142,8 +141,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name); if (!extref) { - btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); - ret = -EROFS; + btrfs_abort_transaction(trans, -ENOENT); + ret = -ENOENT; goto out; } @@ -247,7 +246,7 @@ out: } /* - * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * Insert an extended inode ref into a tree. * * The caller must have checked against BTRFS_LINK_MAX already. 
*/ @@ -424,9 +423,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } -static inline void btrfs_trace_truncate(struct btrfs_inode *inode, - struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, +static inline void btrfs_trace_truncate(const struct btrfs_inode *inode, + const struct extent_buffer *leaf, + const struct btrfs_file_extent_item *fi, u64 offset, int extent_type, int slot) { if (!inode) @@ -671,15 +670,18 @@ delete: } if (del_item && extent_start != 0 && !control->skip_ref_updates) { - struct btrfs_ref ref = { 0 }; + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = extent_start, + .num_bytes = extent_num_bytes, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_header_owner(leaf), + }; bytes_deleted += extent_num_bytes; - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - control->ino, extent_offset, - root->root_key.objectid, false); + btrfs_init_data_ref(&ref, control->ino, extent_offset, + btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index ede43b6c6559..c11b97fdccc4 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -4,14 +4,17 @@ #define BTRFS_INODE_ITEM_H #include <linux/types.h> +#include <linux/crc32c.h> +struct fscrypt_str; +struct extent_buffer; struct btrfs_trans_handle; struct btrfs_root; struct btrfs_path; struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; -struct extent_buffer; +struct btrfs_truncate_control; /* * Return this if we need to call truncate_block for the last bit of the @@ -76,6 +79,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, *ro_flags = (u32)(inode_item_flags >> 32); } +/* Figure the key offset of an extended inode ref. 
*/ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) +{ + return (u64)crc32c(parent_objectid, name, len); +} + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); @@ -100,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 035815c43949..f84e3f9fad84 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,21 +32,19 @@ #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/iomap.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/fsverity.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" -#include "free-space-cache.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -71,31 +69,14 @@ #include "super.h" #include "orphan.h" #include "backref.h" +#include "raid-stripe-tree.h" +#include "fiemap.h" struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; -struct btrfs_dio_data { - ssize_t submitted; - struct extent_changeset *data_reserved; - struct btrfs_ordered_extent *ordered; - bool data_space_reserved; - bool nocow_done; -}; - -struct btrfs_dio_private { - /* Range of I/O */ - u64 file_offset; - u32 bytes; - - /* This must be last */ - struct btrfs_bio bbio; -}; - -static struct bio_set btrfs_dio_bioset; - struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; @@ -113,6 +94,15 @@ struct data_reloc_warn { int mirror_num; }; +/* + * For the file_extent_tree, we want to hold the inode lock when we look up and + * update the disk_i_size, but lockdep will complain because, via our io_tree, + * we hold the tree lock and then take the inode lock when setting delalloc. + * These two things are unrelated, so make a class for the file_extent_tree so + * we don't get the two locking patterns mixed up.
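btrfs_extref_hash(), added to inode-item.h at the start of this hunk, keys an extended inode ref by a crc32c of the link name seeded with the parent directory's objectid. A self-contained sketch of that computation using a bitwise CRC32C (Castagnoli) in place of the kernel's crc32c(); the seed handling here is my reading of the kernel helper (seed used directly, no pre/post inversion), so treat it as an approximation rather than a verified byte-for-byte match:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/*
 * Bitwise CRC32C with the reflected Castagnoli polynomial 0x82F63B78,
 * a caller-supplied seed and no final inversion. Slow but simple.
 */
static uint32_t crc32c_sw(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78u : 0u);
	}
	return crc;
}

/* Key offset of an extended inode ref: hash the name, seed with the parent. */
static uint64_t extref_hash(uint64_t parent_objectid, const char *name, int len)
{
	return (uint64_t)crc32c_sw((uint32_t)parent_objectid, name, (size_t)len);
}

int main(void)
{
	const char *name = "example-link-name";

	printf("extref key offset: %#llx\n",
	       (unsigned long long)extref_hash(256, name, (int)strlen(name)));
	return 0;
}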
+ */ +static struct lock_class_key file_extent_tree_class; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -126,14 +116,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type); static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) @@ -246,7 +231,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, btrfs_ino(inode), file_off, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); @@ -256,7 +241,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off logical += file_off; btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -323,15 +308,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); /* Output without objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -339,7 +324,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } else { btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -348,7 +333,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } /* - * btrfs_inode_lock - lock inode i_rwsem based on arguments passed + * Lock inode i_rwsem based on arguments passed. 
* * ilock_flags can have the following bit set: * @@ -382,7 +367,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) } /* - * btrfs_inode_unlock - unock inode i_rwsem + * Unlock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. @@ -408,17 +393,17 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; u64 page_start = 0, page_end = 0; - struct page *page; + struct folio *folio; - if (locked_page) { - page_start = page_offset(locked_page); - page_end = page_start + PAGE_SIZE - 1; + if (locked_folio) { + page_start = folio_pos(locked_folio); + page_end = page_start + folio_size(locked_folio) - 1; } while (index <= end_index) { @@ -432,13 +417,13 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (locked_page && index == (page_start >> PAGE_SHIFT)) { + if (locked_folio && index == (page_start >> PAGE_SHIFT)) { index++; continue; } - page = find_get_page(inode->vfs_inode.i_mapping, index); + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); index++; - if (!page) + if (IS_ERR(folio)) continue; /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range.
*/ - btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, - offset, bytes); - put_page(page); + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, + offset, bytes); + folio_put(folio); } - if (locked_page) { + if (locked_folio) { /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + PAGE_SIZE) + if (bytes + offset <= page_start + folio_size(locked_folio)) return; /* * In case this page belongs to the delalloc range being @@ -462,8 +447,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + bytes = offset + bytes - folio_pos(locked_folio) - + folio_size(locked_folio); + offset = folio_pos(locked_folio) + folio_size(locked_folio); } } @@ -504,12 +490,12 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct page *page = NULL; + const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; @@ -517,10 +503,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; u64 i_size; - ASSERT((compressed_size > 0 && compressed_pages) || - (compressed_size == 0 && !compressed_pages)); + /* + * The decompressed size must still be no larger than a sector. Under + * heavy race, we can have size == 0 passed in, but that shouldn't be a + * big deal and we can continue the insertion. + */ + ASSERT(size <= sectorsize); + + /* + * The compressed size also needs to be no larger than a sector. + * That's also why we only need one page as the parameter. 
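Because compressed inline data is now asserted to fit in one sector, insert_inline_extent() above can replace the old loop over a page array with a single mapping window. A sketch of that copy step (copy_inline_data() is a hypothetical helper):

/*
 * One kmap_local_folio() window is enough once the data is known to
 * be at most one sector; the old multi-page loop is gone.
 */
static void copy_inline_data(struct extent_buffer *leaf, unsigned long ptr,
			     struct folio *folio, size_t size)
{
	char *kaddr = kmap_local_folio(folio, 0);

	write_extent_buffer(leaf, kaddr, ptr, size);
	kunmap_local(kaddr);
}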
+ */ + if (compressed_folio) + ASSERT(compressed_size <= sectorsize); + else + ASSERT(compressed_size == 0); - if (compressed_size && compressed_pages) + if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { @@ -548,30 +547,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { - struct page *cpage; - int i = 0; - while (compressed_size > 0) { - cpage = compressed_pages[i]; - cur_size = min_t(unsigned long, compressed_size, - PAGE_SIZE); - - kaddr = kmap_local_page(cpage); - write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_local(kaddr); + kaddr = kmap_local_folio(compressed_folio, 0); + write_extent_buffer(leaf, kaddr, ptr, compressed_size); + kunmap_local(kaddr); - i++; - ptr += cur_size; - compressed_size -= cur_size; - } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->vfs_inode.i_mapping, 0); + struct folio *folio; + + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, + 0, 0, 0); + ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); - put_page(page); + folio_put(folio); } btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -603,17 +595,62 @@ fail: return ret; } +static bool can_cow_file_range_inline(struct btrfs_inode *inode, + u64 offset, u64 size, + size_t compressed_size) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 data_len = (compressed_size ?: size); + + /* Inline extents must start at offset 0. */ + if (offset != 0) + return false; + + /* + * Due to the page size limit, for subpage we can only trigger the + * writeback for the dirty sectors of page, that means data writeback + * is doing more writeback than what we want. + * + * This is especially unexpected for some call sites like fallocate, + * where we only increase i_size after everything is done. + * This means we can trigger inline extent even if we didn't want to. + * So here we skip inline extent creation completely. + */ + if (fs_info->sectorsize != PAGE_SIZE) + return false; + + /* Inline extents are limited to sectorsize. */ + if (size > fs_info->sectorsize) + return false; + + /* We cannot exceed the maximum inline data size. */ + if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + return false; + + /* We cannot exceed the user specified max_inline size. */ + if (data_len > fs_info->max_inline) + return false; + + /* Inline extents must be the entirety of the file. */ + if (size < i_size_read(&inode->vfs_inode)) + return false; + + return true; +} /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. + * + * If being used directly, you must have already checked we're allowed to cow + * the range by getting true from can_cow_file_range_inline(). 
*/ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, - size_t compressed_size, - int compress_type, - struct page **compressed_pages, - bool update_i_size) +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, + u64 size, size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; @@ -623,18 +660,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, int ret; struct btrfs_path *path; - /* - * We can create an inline extent if it ends at or beyond the current - * i_size, is no larger than a sector (decompressed), and the (possibly - * compressed) data fits in a leaf and the configured maximum inline - * size. - */ - if (size < i_size_read(&inode->vfs_inode) || - size > fs_info->sectorsize || - data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - data_len > fs_info->max_inline) - return 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -660,7 +685,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, - compressed_pages, update_i_size); + compressed_folio, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -670,7 +695,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, } btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -693,19 +718,68 @@ out: return ret; } +static noinline int cow_file_range_inline(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 offset, u64 end, + size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) +{ + struct extent_state *cached = NULL; + unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; + u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); + int ret; + + if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) + return 1; + + lock_extent(&inode->io_tree, offset, end, &cached); + ret = __cow_file_range_inline(inode, offset, size, compressed_size, + compress_type, compressed_folio, + update_i_size); + if (ret > 0) { + unlock_extent(&inode->io_tree, offset, end, &cached); + return ret; + } + + /* + * In the successful case (ret == 0 here), cow_file_range will return 1. + * + * Quite a bit further up the callstack in extent_writepage(), ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * In the failure case, the locked_folio does get unlocked by + * btrfs_folio_end_all_writers, which asserts that it is still locked + * at that point, so we must *not* unlock it here. + * + * The other two callsites in compress_file_range do not have a + * locked_folio, so they are not relevant to this logic. 
+ */ + if (ret == 0) + locked_folio = NULL; + + extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, + clear_flags, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + return ret; +} + struct async_extent { u64 start; u64 ram_size; u64 compressed_size; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; int compress_type; struct list_head list; }; struct async_chunk { struct btrfs_inode *inode; - struct page *locked_page; + struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; @@ -723,8 +797,8 @@ struct async_cow { static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, - struct page **pages, - unsigned long nr_pages, + struct folio **folios, + unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; @@ -735,8 +809,8 @@ static noinline int add_async_extent(struct async_chunk *cow, async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; - async_extent->pages = pages; - async_extent->nr_pages = nr_pages; + async_extent->folios = folios; + async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; @@ -801,7 +875,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) - return btrfs_compress_heuristic(&inode->vfs_inode, start, end); + return btrfs_compress_heuristic(inode, start, end); return 0; } @@ -811,7 +885,27 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode, small_write); + btrfs_add_inode_defrag(inode, small_write); +} + +static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long end_index = end >> PAGE_SHIFT; + struct folio *folio; + int ret = 0; + + for (unsigned long index = start >> PAGE_SHIFT; + index <= end_index; index++) { + folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + if (IS_ERR(folio)) { + if (!ret) + ret = PTR_ERR(folio); + continue; + } + folio_clear_dirty_for_io(folio); + folio_put(folio); + } + return ret; } /* @@ -840,8 +934,8 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; @@ -855,7 +949,16 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* + * All the folios should have been locked thus no failure. + * + * And even if some folios are missing, btrfs_compress_folios() + * would handle them correctly, so here just do an ASSERT() check for + * early logic errors. 
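The comment above fixes a three-way contract for the cow_file_range_inline() wrapper: positive means "not inlinable, keep going", zero means the range is fully handled, negative means the wrapper already cleaned up. A hedged sketch of a caller consuming it (try_inline_then_cow() is illustrative, not a btrfs function):

/*
 * ret > 0  - inlining not possible, fall back to a regular COW;
 * ret == 0 - inline extent created, writeback finished, folio unlocked;
 * ret < 0  - error, cleanup already done by cow_file_range_inline().
 */
static int try_inline_then_cow(struct btrfs_inode *inode,
			       struct folio *locked_folio, u64 start, u64 end)
{
	int ret;

	ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
				    BTRFS_COMPRESS_NONE, NULL, false);
	if (ret <= 0)
		return ret;	/* handled (0) or failed (< 0) */

	/* ret == 1: continue with a regular extent allocation. */
	return 1;
}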
+ */ + ASSERT(ret == 0); /* * We need to save i_size before now because it could change in between @@ -871,9 +974,9 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - pages = NULL; - nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + folios = NULL; + nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -922,8 +1025,8 @@ again: if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. @@ -937,9 +1040,9 @@ again: compress_type = inode->prop_compress; /* Compression level is applied here. */ - ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), - mapping, start, pages, &nr_pages, &total_in, - &total_compressed); + ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + mapping, start, folios, &nr_folios, &total_in, + &total_compressed); if (ret) goto mark_incompressible; @@ -949,7 +1052,7 @@ again: */ poff = offset_in_page(total_compressed); if (poff) - memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. @@ -960,43 +1063,16 @@ again: * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - if (total_in < actual_end) { - ret = cow_file_range_inline(inode, actual_end, 0, - BTRFS_COMPRESS_NONE, NULL, - false); - } else { - ret = cow_file_range_inline(inode, actual_end, - total_compressed, - compress_type, pages, - false); - } - if (ret <= 0) { - unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING; - - if (ret < 0) - mapping_set_error(mapping, -EIO); - - /* - * inline extent creation worked or returned error, - * we don't need to create any more async work items. - * Unlock and free up our temp pages. - * - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be done _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - NULL, - clear_flags, - PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - goto free_pages; - } + if (total_in < actual_end) + ret = cow_file_range_inline(inode, NULL, start, end, 0, + BTRFS_COMPRESS_NONE, NULL, false); + else + ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, + compress_type, folios[0], false); + if (ret <= 0) { + if (ret < 0) + mapping_set_error(mapping, -EIO); + goto free_pages; } /* @@ -1018,8 +1094,8 @@ again: * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. 
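compress_file_range() above zeroes the unused tail of the last compressed folio so stale bytes never reach disk. A minimal sketch, assuming PAGE_SIZE compression folios as the arithmetic above implies (zero_compressed_tail() is a hypothetical name):

/*
 * Clear everything past total_compressed in the final folio; the
 * compressor only wrote up to that offset.
 */
static void zero_compressed_tail(struct folio **folios,
				 unsigned long nr_folios,
				 unsigned long total_compressed)
{
	unsigned int poff = offset_in_page(total_compressed);

	if (poff)
		folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
}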
*/ - ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, + nr_folios, compress_type); BUG_ON(ret); if (start + total_in < end) { start += total_in; @@ -1036,12 +1112,12 @@ cleanup_and_bail_uncompressed: BTRFS_COMPRESS_NONE); BUG_ON(ret); free_pages: - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); + if (folios) { + for (i = 0; i < nr_folios; i++) { + WARN_ON(folios[i]->mapping); + btrfs_free_compr_folio(folios[i]); } - kfree(pages); + kfree(folios); } } @@ -1049,21 +1125,21 @@ static void free_async_extent_pages(struct async_extent *async_extent) { int i; - if (!async_extent->pages) + if (!async_extent->folios) return; - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - put_page(async_extent->pages[i]); + for (i = 0; i < async_extent->nr_folios; i++) { + WARN_ON(async_extent->folios[i]->mapping); + btrfs_free_compr_folio(async_extent->folios[i]); } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + kfree(async_extent->folios); + async_extent->nr_folios = 0; + async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, - struct page *locked_page) + struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1076,20 +1152,22 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + ret = run_delalloc_cow(inode, locked_folio, start, end, + &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); - if (locked_page) { - const u64 page_start = page_offset(locked_page); - - set_page_writeback(locked_page); - end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, locked_page, + btrfs_cleanup_ordered_extents(inode, locked_folio, + start, end - start + 1); + if (locked_folio) { + const u64 page_start = folio_pos(locked_folio); + + folio_start_writeback(locked_folio); + folio_end_writeback(locked_folio); + btrfs_mark_ordered_io_finished(inode, locked_folio, page_start, PAGE_SIZE, !ret); - mapping_set_error(locked_page->mapping, ret); - unlock_page(locked_page); + mapping_set_error(locked_folio->mapping, ret); + folio_unlock(locked_folio); } } } @@ -1103,10 +1181,13 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; struct btrfs_key ins; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; + struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1114,20 +1195,23 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, kthread_associate_blkcg(async_chunk->blkcg_css); /* - * If async_chunk->locked_page is in the async_extent range, we need to + * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. 
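Both release paths above now hand compressed folios back with btrfs_free_compr_folio() rather than put_page(), matching their allocation from the compression folio pool. Sketch (free_compr_folios() is illustrative):

/*
 * Compressed folios are pooled, so they are returned to the pool
 * instead of being dropped with folio_put()/put_page().
 */
static void free_compr_folios(struct folio **folios, unsigned long nr_folios)
{
	for (unsigned long i = 0; i < nr_folios; i++) {
		WARN_ON(folios[i]->mapping);	/* must be detached already */
		btrfs_free_compr_folio(folios[i]);
	}
	kfree(folios);
}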
*/ - if (async_chunk->locked_page) { - u64 locked_page_start = page_offset(async_chunk->locked_page); - u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + if (async_chunk->locked_folio) { + u64 locked_folio_start = folio_pos(async_chunk->locked_folio); + u64 locked_folio_end = locked_folio_start + + folio_size(async_chunk->locked_folio) - 1; - if (!(start >= locked_page_end || end <= locked_page_start)) - locked_page = async_chunk->locked_page; + if (!(start >= locked_folio_end || end <= locked_folio_start)) + locked_folio = async_chunk->locked_folio; } - lock_extent(io_tree, start, end, NULL); if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, locked_page); + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); + submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1142,34 +1226,30 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } + lock_extent(io_tree, start, end, &cached); + /* Here we're doing allocation and writeback of the compressed pages */ - em = create_io_em(inode, start, - async_extent->ram_size, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - async_extent->ram_size, /* ram_bytes */ - async_extent->compress_type, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.ram_bytes = async_extent->ram_size; + file_extent.num_bytes = async_extent->ram_size; + file_extent.offset = 0; + file_extent.compression = async_extent->compress_type; + + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserve; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ - async_extent->ram_size, /* num_bytes */ - async_extent->ram_size, /* ram_bytes */ - ins.objectid, /* disk_bytenr */ - ins.offset, /* disk_num_bytes */ - 0, /* offset */ - 1 << BTRFS_ORDERED_COMPRESSED, - async_extent->compress_type); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1179,16 +1259,18 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, /* Clear dirty, set writeback and unlock the pages. 
*/ extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, - async_extent->pages, /* compressed_pages */ - async_extent->nr_pages, + async_extent->folios, /* compressed_folios */ + async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; @@ -1197,7 +1279,8 @@ out_free_reserve: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + NULL, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | @@ -1207,13 +1290,13 @@ out_free_reserve: kthread_associate_blkcg(NULL); btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - root->root_key.objectid, btrfs_ino(inode), start, + btrfs_root_id(root), btrfs_ino(inode), start, async_extent->ram_size, ret); kfree(async_extent); } -static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, - u64 num_bytes) +u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1227,15 +1310,15 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em); em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; + if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + alloc_hint = extent_map_block_start(em); if (em) free_extent_map(em); } else { - alloc_hint = em->block_start; + alloc_hint = extent_map_block_start(em); free_extent_map(em); } } @@ -1250,21 +1333,21 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * - * locked_page is the page that writepage had locked already. We use + * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_page. + * When this function fails, it unlocks all pages except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_page and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_page is + * unlocks all pages including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_folio is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_page are unlocked. + * - Else all pages except for @locked_folio are unlocked. 
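A recurring pattern in this hunk: the long create_io_em()/btrfs_alloc_ordered_extent() parameter lists are replaced by one struct btrfs_file_extent filled up front. A sketch of the descriptor for the plain uncompressed case, mirroring the values cow_file_range() uses below (fill_regular_extent() is a hypothetical helper):

/* Describe a freshly reserved, uncompressed extent at file offset 0. */
static void fill_regular_extent(struct btrfs_file_extent *fe,
				const struct btrfs_key *ins)
{
	fe->disk_bytenr    = ins->objectid;	/* start of the new extent */
	fe->disk_num_bytes = ins->offset;	/* bytes reserved on disk */
	fe->num_bytes      = ins->offset;	/* bytes covered in the file */
	fe->ram_bytes      = ins->offset;	/* uncompressed length */
	fe->offset         = 0;			/* no offset into the extent */
	fe->compression    = BTRFS_COMPRESS_NONE;
}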
* * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1273,12 +1356,13 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, u64 start, u64 end, - u64 *done_offset, + struct folio *locked_folio, u64 start, + u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_state *cached = NULL; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; @@ -1304,57 +1388,36 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { - u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), - end + 1); - + if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, actual_end, 0, + ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); - if (ret == 0) { - /* - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be run _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + if (ret <= 0) { /* - * locked_page is locked by the caller of - * writepage_delalloc(), not locked by - * __process_pages_contig(). + * We succeeded, return 1 so the caller knows we're done + * with this page and already handled the IO. * - * We can't let __process_pages_contig() to unlock it, - * as it doesn't have any subpage::writers recorded. - * - * Here we manually unlock the page, since the caller - * can't determine if it's an inline extent or a - * compressed extent. + * If there was an error then cow_file_range_inline() has + * already done the cleanup. */ - unlock_page(locked_page); - ret = 1; + if (ret == 0) + ret = 1; goto done; - } else if (ret < 0) { - goto out_unlock; } } - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); + + /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 
0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; /* * Relocation relies on the relocated extents to have exactly the same @@ -1374,6 +1437,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, while (num_bytes > 0) { struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, @@ -1399,8 +1463,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } @@ -1410,25 +1479,35 @@ static noinline int cow_file_range(struct btrfs_inode *inode, extent_reserved = true; ram_size = ins.offset; - em = create_io_em(inode, start, ins.offset, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - ram_size, /* ram_bytes */ - BTRFS_COMPRESS_NONE, /* compress_type */ - BTRFS_ORDERED_REGULAR /* type */); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = ins.offset; + file_extent.ram_bytes = ins.offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; + + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ + lock_extent(&inode->io_tree, start, start + ram_size - 1, + &cached); + + em = btrfs_create_io_em(inode, start, &file_extent, + BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, start, + start + ram_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, - ram_size, ins.objectid, cur_alloc_size, - 0, 1 << BTRFS_ORDERED_REGULAR, - BTRFS_COMPRESS_NONE); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { + unlock_extent(&inode->io_tree, start, + start + ram_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1456,27 +1535,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered (Private2) bit so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 
0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); - if (num_bytes < cur_alloc_size) + if (num_bytes < ram_size) num_bytes = 0; else - num_bytes -= cur_alloc_size; + num_bytes -= ram_size; alloc_hint = ins.objectid + ins.offset; - start += cur_alloc_size; + start += ram_size; extent_reserved = false; /* @@ -1487,6 +1551,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; @@ -1502,34 +1568,36 @@ out_unlock: * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| - * `- orig_start `- start `- start + cur_alloc_size `- end + * `- orig_start `- start `- start + ram_size `- end * * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * btrfs_run_delalloc_range(). + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the cleanup function. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_page) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { - if (!locked_page) + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, @@ -1542,11 +1610,11 @@ out_unlock: */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, - start + cur_alloc_size - 1, - locked_page, - clear_bits, + start + ram_size - 1, + locked_folio, &cached, clear_bits, page_ops); - start += cur_alloc_size; + btrfs_qgroup_free_data(inode, NULL, start, ram_size, NULL); + start += ram_size; } /* @@ -1557,8 +1625,9 @@ out_unlock: */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); + extent_clear_unlock_delalloc(inode, start, end, locked_folio, + &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL); } return ret; } @@ -1567,8 +1636,11 @@ out_unlock: * Phase two of compressed writeback. 
This is the ordered portion of the code, * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. + * + * If called with @do_free == true then it'll try to finish the work and free + * the work struct eventually. */ -static noinline void submit_compressed_extents(struct btrfs_work *work) +static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); @@ -1577,6 +1649,19 @@ static noinline void submit_compressed_extents(struct btrfs_work *work) unsigned long nr_pages; u64 alloc_hint = 0; + if (do_free) { + struct async_cow *async_cow; + + btrfs_add_delayed_iput(async_chunk->inode); + if (async_chunk->blkcg_css) + css_put(async_chunk->blkcg_css); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); + return; + } + nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; @@ -1593,23 +1678,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work) cond_wake_up_nomb(&fs_info->async_submit_wait); } -static noinline void async_cow_free(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - struct async_cow *async_cow; - - async_chunk = container_of(work, struct async_chunk, work); - btrfs_add_delayed_iput(async_chunk->inode); - if (async_chunk->blkcg_css) - css_put(async_chunk->blkcg_css); - - async_cow = async_chunk->async_cow; - if (atomic_dec_and_test(&async_cow->num_chunks)) - kvfree(async_cow); -} - static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1628,7 +1698,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, if (!ctx) return false; - unlock_extent(&inode->io_tree, start, end, NULL); set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; @@ -1650,15 +1719,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, INIT_LIST_HEAD(&async_chunk[i].extents); /* - * The locked_page comes all the way from writepage and its - * the original page we were actually given. As we spread + * The locked_folio comes all the way from writepage and its + * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk - * structs, only the first struct needs a pointer to locked_page + * structs, only the first struct needs a pointer to + * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ - if (locked_page) { + if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to @@ -1668,12 +1738,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. 
*/ - wbc_account_cgroup_owner(wbc, locked_page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); - async_chunk[i].locked_page = locked_page; - locked_page = NULL; + async_chunk[i].locked_folio = locked_folio; + locked_folio = NULL; } else { - async_chunk[i].locked_page = NULL; + async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { @@ -1685,7 +1755,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, } btrfs_init_work(&async_chunk[i].work, compress_file_range, - submit_compressed_extents, async_cow_free); + submit_compressed_extents); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); @@ -1702,7 +1772,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -1710,48 +1780,27 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, &done_offset, - true, false); + ret = cow_file_range(inode, locked_folio, start, end, + &done_offset, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, - done_offset, wbc, pages_dirty); + extent_write_locked_range(&inode->vfs_inode, locked_folio, + start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } -static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes, bool nowait) -{ - struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); - struct btrfs_ordered_sum *sums; - int ret; - LIST_HEAD(list); - - ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, - &list, 0, nowait); - if (ret == 0 && list_empty(&list)) - return 0; - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); - } - if (ret < 0) - return ret; - return 1; -} - -static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end) +static int fallback_to_cow(struct btrfs_inode *inode, + struct folio *locked_folio, const u64 start, + const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_state *cached_state = NULL; u64 range_start = start; u64 count; int ret; @@ -1788,6 +1837,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ + lock_extent(io_tree, start, end, &cached_state); count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { @@ -1806,13 +1856,15 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, NULL); } + unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. 
*/ - ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, + true); ASSERT(ret != 1); return ret; } @@ -1832,13 +1884,11 @@ struct can_nocow_file_extent_args { */ bool free_path; - /* Output fields. Only set when can_nocow_file_extent() returns 1. */ - - u64 disk_bytenr; - u64 disk_num_bytes; - u64 extent_offset; - /* Number of bytes that can be written to in NOCOW mode. */ - u64 num_bytes; + /* + * Output fields. Only set when can_nocow_file_extent() returns 1. + * The expected file extent for the NOCOW write. + */ + struct btrfs_file_extent file_extent; }; /* @@ -1859,6 +1909,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; + struct btrfs_root *csum_root; + u64 io_start; u64 extent_end; u8 extent_type; int can_nocow = 0; @@ -1871,11 +1923,6 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out; - /* Can't access these fields unless we know it's not an inline extent. */ - args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - args->extent_offset = btrfs_file_extent_offset(leaf, fi); - if (!(inode->flags & BTRFS_INODE_NODATACOW) && extent_type == BTRFS_FILE_EXTENT_REG) goto out; @@ -1891,7 +1938,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, goto out; /* An explicit hole, must COW. */ - if (args->disk_bytenr == 0) + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out; /* Compressed/encrypted/encoded extents must be COWed. */ @@ -1902,6 +1949,12 @@ static int can_nocow_file_extent(struct btrfs_path *path, extent_end = btrfs_file_extent_end(path); + args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); + args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); + /* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid @@ -1910,8 +1963,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, btrfs_release_path(path); ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->extent_offset, - args->disk_bytenr, args->strict, path); + key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, args->strict, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1919,7 +1972,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (args->free_path) { /* * We don't need the path anymore, plus through the - * csum_exist_in_range() call below we will end up allocating + * btrfs_lookup_csums_list() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage. 
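can_nocow_file_extent() forces COW whenever csums exist for the candidate range; with the new output struct, the physical range starts at disk_bytenr + offset. A hedged sketch of that probe (nocow_range_has_csums() is a hypothetical wrapper around the btrfs_lookup_csums_list() call shown below):

/*
 * Nonzero return (csums found, or an error) makes the caller fall
 * back to COW, matching the ret != 0 check in can_nocow_file_extent().
 */
static int nocow_range_has_csums(struct btrfs_fs_info *fs_info,
				 const struct btrfs_file_extent *fe,
				 bool nowait)
{
	const u64 io_start = fe->disk_bytenr + fe->offset;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, io_start);

	return btrfs_lookup_csums_list(csum_root, io_start,
				       io_start + fe->num_bytes - 1,
				       NULL, nowait);
}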
*/ @@ -1932,16 +1985,19 @@ static int can_nocow_file_extent(struct btrfs_path *path, atomic_read(&root->snapshot_force_cow)) goto out; - args->disk_bytenr += args->extent_offset; - args->disk_bytenr += args->start - key->offset; - args->num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.offset += args->start - key->offset; + io_start = args->file_extent.disk_bytenr + args->file_extent.offset; /* * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ - ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, - nowait); + + csum_root = btrfs_csum_root(root->fs_info, io_start); + ret = btrfs_lookup_csums_list(csum_root, io_start, + io_start + args->file_extent.num_bytes - 1, + NULL, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1955,6 +2011,110 @@ static int can_nocow_file_extent(struct btrfs_path *path, } /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? 
(1U << BTRFS_ORDERED_PREALLOC) + : (1U << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. + */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + + /* + * btrfs_reloc_clone_csums() error, now we're OK to call error handler, + * as metadata for created ordered extent will only be freed by + * btrfs_finish_ordered_io(). + */ + return ret; +} + +/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * @@ -1962,13 +2122,18 @@ static int can_nocow_file_extent(struct btrfs_path *path, * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. + */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -1991,17 +2156,14 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.end = end; nocow_args.writeback_path = true; - while (1) { + while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; + struct extent_state *cached_state = NULL; u64 extent_end; - u64 ram_bytes; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2056,12 +2218,13 @@ next_slot: /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. 
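run_delalloc_nocow() above accumulates every stretch that cannot be written in place into one pending COW range and flushes it with a single fallback_to_cow() call once a NOCOW extent (or the end of the range) is reached. A hedged sketch of the flush step (flush_pending_cow() is a hypothetical helper):

/* Flush the pending [*cow_start, next_start - 1] range, if any. */
static int flush_pending_cow(struct btrfs_inode *inode,
			     struct folio *locked_folio,
			     u64 *cow_start, u64 next_start)
{
	int ret = 0;

	if (*cow_start != (u64)-1) {
		ret = fallback_to_cow(inode, locked_folio, *cow_start,
				      next_start - 1);
		*cow_start = (u64)-1;	/* nothing pending any more */
	}
	return ret;
}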
*/ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* @@ -2077,7 +2240,6 @@ next_slot: ret = -EUCLEAN; goto error; } - ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); extent_end = btrfs_file_extent_end(path); /* @@ -2097,7 +2259,9 @@ next_slot: goto must_cow; ret = 0; - nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + nocow_bg = btrfs_inc_nocow_writers(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset); if (!nocow_bg) { must_cow: /* @@ -2124,79 +2288,23 @@ must_cow: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1); + ret = fallback_to_cow(inode, locked_folio, cow_start, + found_key.offset - 1); cow_start = (u64)-1; if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } } - nocow_end = cur_offset + nocow_args.num_bytes - 1; - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - u64 orig_start = found_key.offset - nocow_args.extent_offset; - struct extent_map *em; - - em = create_io_em(inode, cur_offset, nocow_args.num_bytes, - orig_start, - nocow_args.disk_bytenr, /* block_start */ - nocow_args.num_bytes, /* block_len */ - nocow_args.disk_num_bytes, /* orig_block_len */ - ram_bytes, BTRFS_COMPRESS_NONE, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - nocow_args.num_bytes, nocow_args.num_bytes, - nocow_args.disk_bytenr, nocow_args.num_bytes, 0, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW), - BTRFS_COMPRESS_NONE); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. - */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). 
- */ - if (ret) - goto error; - if (cur_offset > end) - break; } btrfs_release_path(path); @@ -2204,11 +2312,12 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end); + ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; - if (ret) + if (ret) { + cow_end = end; goto error; + } } btrfs_free_path(path); @@ -2216,19 +2325,59 @@ must_cow: error: /* + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Only need to clear the dirty flag as they will never be submitted. + * Ordered extent and extent maps are handled by + * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * + * 2) Failed with error from fallback_to_cow() + * start cur_offset cow_end end + * |/////////////|-----------| | + * + * For range [start, cur_offset) it's the same as case 1). + * But for range [cur_offset, cow_end), the folios have dirty flag + * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * + * Thus we should not call extent_clear_unlock_delalloc() on range + * [cur_offset, cow_end), as the folios are already unlocked. + * + * So clear the folio dirty flags for [start, cur_offset) first. + */ + if (cur_offset > start) + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + + /* * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. + * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. + */ + if (cow_end) + cur_offset = cow_end + 1; + + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. */ - if (cow_start != (u64)-1) - cur_offset = cow_start; - if (cur_offset < end) + if (cur_offset < end) { + struct extent_state *cached = NULL; + + lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_DEFRAG | + locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + } btrfs_free_path(path); return ret; } @@ -2237,8 +2386,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, - 0, NULL)) + test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2249,40 +2397,39 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. 
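The doc comment above introduces btrfs_run_delalloc_range(), whose body follows. Its dispatch order is fixed: NOCOW first, then async compression, then the zoned or plain COW path. A condensed sketch of that decision tree (dispatch_delalloc() is illustrative; the real function also cleans up ordered extents on error):

static int dispatch_delalloc(struct btrfs_inode *inode,
			     struct folio *locked_folio, u64 start, u64 end,
			     struct writeback_control *wbc)
{
	if (should_nocow(inode, start, end))
		return run_delalloc_nocow(inode, locked_folio, start, end);

	if (btrfs_inode_can_compress(inode) &&
	    inode_need_compress(inode, start, end) &&
	    run_delalloc_compressed(inode, locked_folio, start, end, wbc))
		return 1;	/* folio fully handled asynchronously */

	if (btrfs_is_zoned(inode->root->fs_info))
		return run_delalloc_cow(inode, locked_folio, start, end,
					wbc, true);

	return cow_file_range(inode, locked_folio, start, end, NULL,
			      false, false);
}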
*/ -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* - * The range must cover part of the @locked_page, or a return of 1 + * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= page_offset(locked_page) || - start >= page_offset(locked_page) + PAGE_SIZE)); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_pos(locked_folio) + folio_size(locked_folio))); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_page, start, end); + ret = run_delalloc_nocow(inode, locked_folio, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, locked_page, start, end, wbc)) + run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_page, start, end, NULL, + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_page, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1); return ret; } @@ -2292,6 +2439,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; @@ -2330,6 +2479,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state u64 new_size, old_size; u32 num_extents; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; @@ -2377,55 +2528,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state spin_unlock(&inode->lock); } -static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct btrfs_inode *inode) +static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&inode->delalloc_inodes)) { - list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); - root->nr_delalloc_inodes++; - if (root->nr_delalloc_inodes == 1) { - spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(!list_empty(&root->delalloc_root)); - list_add_tail(&root->delalloc_root, - &fs_info->delalloc_roots); - spin_unlock(&fs_info->delalloc_root_lock); - } + ASSERT(list_empty(&inode->delalloc_inodes)); + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&fs_info->delalloc_root_lock); + ASSERT(list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } spin_unlock(&root->delalloc_lock); } -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) +void btrfs_del_delalloc_inode(struct btrfs_inode *inode) { + struct 
btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + lockdep_assert_held(&root->delalloc_lock); + + /* + * We may be called after the inode was already deleted from the list, + * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), + * and then later through btrfs_clear_delalloc_extent() while the inode + * still has ->delalloc_bytes > 0. + */ if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(list_empty(&root->delalloc_root)); + ASSERT(!list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) -{ - spin_lock(&root->delalloc_lock); - __btrfs_del_delalloc_inode(root, inode); - spin_unlock(&root->delalloc_lock); -} - /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. @@ -2435,6 +2581,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s { struct btrfs_fs_info *fs_info = inode->root->fs_info; + lockdep_assert_held(&inode->io_tree.lock); + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* @@ -2443,10 +2591,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; + u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); @@ -2459,13 +2606,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); + prev_delalloc_bytes = inode->delalloc_bytes; inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; - if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_clear_delalloc_extent(). 
+ */ + if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) + btrfs_add_delalloc_inode(inode); } if (!(state->state & EXTENT_DELALLOC_NEW) && @@ -2487,6 +2641,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); + lockdep_assert_held(&inode->io_tree.lock); + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; @@ -2500,7 +2656,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; - bool do_list = !btrfs_is_free_space_inode(inode); + u64 new_delalloc_bytes; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); @@ -2520,7 +2676,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, return; if (!btrfs_is_data_reloc_root(root) && - do_list && !(state->state & EXTENT_NORESERVE) && + !btrfs_is_free_space_inode(inode) && + !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2528,11 +2685,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; - if (do_list && inode->delalloc_bytes == 0 && - test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_del_delalloc_inode(root, inode); + new_delalloc_bytes = inode->delalloc_bytes; spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_set_delalloc_extent(). + */ + if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { + spin_lock(&root->delalloc_lock); + btrfs_del_delalloc_inode(inode); + spin_unlock(&root->delalloc_lock); + } } if ((state->state & EXTENT_DELALLOC_NEW) && @@ -2546,44 +2712,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered) -{ - u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - u64 len = bbio->bio.bi_iter.bi_size; - struct btrfs_ordered_extent *new; - int ret; - - /* Must always be called for the beginning of an ordered extent. */ - if (WARN_ON_ONCE(start != ordered->disk_bytenr)) - return -EINVAL; - - /* No need to split if the ordered extent covers the entire bio. */ - if (ordered->disk_num_bytes == len) { - refcount_inc(&ordered->refs); - bbio->ordered = ordered; - return 0; - } - - /* - * Don't split the extent_map for NOCOW extents, as we're writing into - * a pre-existing one. - */ - if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); - if (ret) - return ret; - } - - new = btrfs_split_ordered_extent(ordered, len); - if (IS_ERR(new)) - return PTR_ERR(new); - bbio->ordered = new; - return 0; -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. 
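The delalloc tracking rework above drops the BTRFS_INODE_IN_DELALLOC_LIST runtime flag: an inode is now on its root's delalloc list exactly while inode->delalloc_bytes is non-zero, and only the 0 -> non-zero and non-zero -> 0 transitions, observed while the inode's io_tree lock is held, add or remove it. A minimal userspace sketch of that transition-based pattern, with hypothetical delalloc_root/delalloc_inode types standing in for the btrfs structures and a pthread mutex standing in for root->delalloc_lock:

#include <pthread.h>

struct list_node { struct list_node *prev, *next; };

static void list_init(struct list_node *n) { n->prev = n->next = n; }
static int list_empty(const struct list_node *n) { return n->next == n; }
static void list_add_tail(struct list_node *n, struct list_node *head)
{
        n->prev = head->prev; n->next = head;
        head->prev->next = n; head->prev = n;
}
static void list_del_init(struct list_node *n)
{
        n->prev->next = n->next; n->next->prev = n->prev;
        list_init(n);
}

struct delalloc_root {
        pthread_mutex_t lock;           /* plays the role of root->delalloc_lock */
        struct list_node inodes;        /* inodes with pending delalloc work */
};

struct delalloc_inode {
        unsigned long long delalloc_bytes; /* callers serialize via the io_tree lock */
        struct list_node link;
        struct delalloc_root *root;
};

/* Mirrors btrfs_set_delalloc_extent(): called with the io_tree lock held. */
static void add_delalloc_bytes(struct delalloc_inode *inode, unsigned long long len)
{
        unsigned long long prev = inode->delalloc_bytes;

        inode->delalloc_bytes += len;
        if (prev == 0) {                /* 0 -> non-zero: join the list */
                pthread_mutex_lock(&inode->root->lock);
                list_add_tail(&inode->link, &inode->root->inodes);
                pthread_mutex_unlock(&inode->root->lock);
        }
}

/* Mirrors btrfs_clear_delalloc_extent(): called with the io_tree lock held. */
static void sub_delalloc_bytes(struct delalloc_inode *inode, unsigned long long len)
{
        inode->delalloc_bytes -= len;
        if (inode->delalloc_bytes == 0) { /* non-zero -> 0: leave the list */
                pthread_mutex_lock(&inode->root->lock);
                if (!list_empty(&inode->link)) /* may already be gone, see above */
                        list_del_init(&inode->link);
                pthread_mutex_unlock(&inode->root->lock);
        }
}

int main(void)
{
        struct delalloc_root root = { PTHREAD_MUTEX_INITIALIZER, { 0 } };
        struct delalloc_inode ino = { 0, { 0 }, &root };

        list_init(&root.inodes);
        list_init(&ino.link);
        add_delalloc_bytes(&ino, 4096); /* goes on the list */
        sub_delalloc_bytes(&ino, 4096); /* comes back off   */
        return list_empty(&root.inodes) ? 0 : 1;
}

Since only the io_tree lock holder can observe the counter transition, the list manipulation and the counter update need not share one lock, which is what lets the diff move the btrfs_add_delalloc_inode() call outside inode->lock.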
@@ -2622,11 +2750,11 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); - if (em->block_start != EXTENT_MAP_HOLE) + if (em->disk_bytenr != EXTENT_MAP_HOLE) goto next; em_len = em->len; @@ -2676,7 +2804,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { - struct page *page; + struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2688,50 +2816,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page = fixup->page; + struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 page_start = page_offset(page); - u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = folio_pos(folio) + folio_size(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before - * we take the page lock. + * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + folio_size(folio)); again: - lock_page(page); + folio_lock(folio); /* - * Before we queued this fixup, we took a reference on the page. - * page->mapping may go NULL, but it shouldn't be moved to a different + * Before we queued this fixup, we took a reference on the folio. + * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + if (!folio->mapping || !folio_test_dirty(folio) || + !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * - * 1) We got here and our page had already been dealt with and + * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. - * 2) Our page was already dealt with, but we happened to get an + * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but - * because the page was already dealt with we don't want to - * mark the page with an error, so make sure we're resetting + * because the folio was already dealt with we don't want to + * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC - * when the page was already properly dealt with. + * when the folio was already properly dealt with. 
*/ if (!ret) { - btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE, + page_start, folio_size(folio), true); } ret = 0; @@ -2739,7 +2868,7 @@ again: } /* - * We can't mess with the page state unless it is locked, so now that + * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) @@ -2748,14 +2877,14 @@ again: lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PageOrdered(page)) + if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -2773,7 +2902,7 @@ again: * * The page was dirty when we started, nothing should have cleaned it. */ - BUG_ON(!PageDirty(page)); + BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); @@ -2787,14 +2916,14 @@ out_page: * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ - mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page, page_start, - PAGE_SIZE, !ret); - clear_page_dirty_for_io(page); - } - btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); - unlock_page(page); - put_page(page); + mapping_set_error(folio->mapping, ret); + btrfs_mark_ordered_io_finished(inode, folio, page_start, + folio_size(folio), !ret); + folio_clear_dirty_for_io(folio); + } + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* @@ -2807,33 +2936,34 @@ out_page: /* * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a + * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. + * the delalloc bit and make it safe to write the folio. */ -int btrfs_writepage_cow_fixup(struct page *page) +int btrfs_writepage_cow_fixup(struct folio *folio) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct inode *inode = folio->mapping->host; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; - /* This page has ordered extent covering it already */ - if (PageOrdered(page)) + /* This folio has ordered extent covering it already */ + if (folio_test_ordered(folio)) return 0; /* - * PageChecked is set below when we create a fixup worker for this page, - * don't try to create another one if we're already PageChecked() + * folio_checked is set below when we create a fixup worker for this + * folio, don't try to create another one if we're already + * folio_test_checked. 
* - * The extent_io writepage code will redirty the page if we send back + * The extent_io writepage code will redirty the folio if we send back * EAGAIN. */ - if (PageChecked(page)) + if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); @@ -2843,14 +2973,14 @@ int btrfs_writepage_cow_fixup(struct page *page) /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation - * takes place outside of the page lock, and we can't trust - * page->mapping outside of the page lock. + * takes place outside of the folio lock, and we can't trust + * page->mapping outside of the folio lock. */ ihold(inode); - btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); - get_page(page); - btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); - fixup->page = page; + btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); + folio_get(folio); + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); + fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -2982,10 +3112,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) num_bytes = oe->truncated_len; - ram_bytes = num_bytes; - } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3001,7 +3129,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); - return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + return insert_reserved_file_extent(trans, oe->inode, oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } @@ -3013,7 +3141,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, */ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { - struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); + struct btrfs_inode *inode = ordered_extent->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; @@ -3072,9 +3200,14 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); + + ret = btrfs_insert_raid_extent(trans, ordered_extent); + if (ret) + btrfs_abort_transaction(trans, ret); + goto out; } @@ -3093,6 +3226,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; + ret = btrfs_insert_raid_extent(trans, ordered_extent); + if (ret) + goto out; + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { @@ -3113,8 +3250,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - 
unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3138,12 +3280,11 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); - ret = btrfs_update_inode_fallback(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } - ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, &cached_state); @@ -3162,9 +3303,8 @@ out: * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ - if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, - &ordered_extent->flags)) - mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + if (ret) + btrfs_mark_ordered_extent_error(ordered_extent); if (truncated) unwritten_start += logical_len; @@ -3218,7 +3358,7 @@ out: * Actually free the qgroup rsv which was released when * the ordered extent was created. */ - btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } @@ -3240,8 +3380,9 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { - if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && - !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + if (btrfs_is_zoned(ordered->inode->root->fs_info) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && + list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); return btrfs_finish_one_ordered(ordered); } @@ -3299,7 +3440,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, if (btrfs_is_data_reloc_root(inode->root) && test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - 1, NULL)) { + NULL)) { /* Skip the range without csum for data reloc inode */ clear_extent_bits(&inode->io_tree, file_offset, end, EXTENT_NODATASUM); @@ -3323,7 +3464,7 @@ zeroit: } /* - * btrfs_add_delayed_iput - perform a delayed iput on @inode + * Perform a delayed iput on @inode. * * @inode: The inode we want to perform iput on * @@ -3533,7 +3674,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, last_objectid, root); + inode = btrfs_iget(last_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; @@ -3718,13 +3859,37 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 1; } +static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (WARN_ON_ONCE(inode->file_extent_tree)) + return 0; + if (btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + if (!S_ISREG(inode->vfs_inode.i_mode)) + return 0; + if (btrfs_is_free_space_inode(inode)) + return 0; + + inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); + if (!inode->file_extent_tree) + return -ENOMEM; + + extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. 
*/ + lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); + + return 0; +} + /* * read an inode from the btree into the in-memory inode */ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; @@ -3737,6 +3902,10 @@ static int btrfs_read_locked_inode(struct inode *inode, bool filled = false; int first_xattr_slot; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + return ret; + ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; @@ -3747,7 +3916,7 @@ static int btrfs_read_locked_inode(struct inode *inode, return -ENOMEM; } - memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3771,19 +3940,17 @@ static int btrfs_read_locked_inode(struct inode *inode, btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); + inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); + inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + btrfs_timespec_nsec(leaf, &inode_item->mtime)); inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime.tv_sec = - btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime.tv_nsec = - btrfs_timespec_nsec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); @@ -3795,7 +3962,9 @@ static int btrfs_read_locked_inode(struct inode *inode, inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = (u64)-1; + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); @@ -3807,9 +3976,9 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in delayed_nodes_tree. + * in the delayed_nodes xarray. 
*/ - if (BTRFS_I(inode)->last_trans == fs_info->generation) + if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -3886,7 +4055,7 @@ cache_acl: btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), - root->root_key.objectid, ret); + btrfs_root_id(root), ret); } if (path != in_path) btrfs_free_path(path); @@ -3939,24 +4108,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime(inode).tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime(inode).tv_nsec); + inode_get_ctime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->otime, - BTRFS_I(inode)->i_otime.tv_sec); - btrfs_set_token_timespec_nsec(&token, &item->otime, - BTRFS_I(inode)->i_otime.tv_nsec); + btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); btrfs_set_token_inode_generation(&token, item, @@ -3974,19 +4141,20 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * copy everything in the in-memory inode into the btree. */ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode) + struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; struct extent_buffer *leaf; + struct btrfs_key key; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); + btrfs_get_inode_key(inode, &key); + ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -4009,10 +4177,10 @@ failed: /* * copy everything in the in-memory inode into the btree. 
*/ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode) +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -4028,23 +4196,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); - ret = btrfs_delayed_update_inode(trans, root, inode); + ret = btrfs_delayed_update_inode(trans, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); return ret; } - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, inode); } int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode) + struct btrfs_inode *inode) { int ret; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret == -ENOSPC) - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, inode); return ret; } @@ -4150,9 +4318,8 @@ err: inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode_set_ctime_current(&inode->vfs_inode); - dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); - ret = btrfs_update_inode(trans, root, dir); + inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + ret = btrfs_update_inode(trans, dir); out: return ret; } @@ -4166,7 +4333,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, inode->root, inode); + ret = btrfs_update_inode(trans, inode); } return ret; } @@ -4250,9 +4417,9 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { - objectid = inode->root->root_key.objectid; + objectid = btrfs_root_id(inode->root); } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - objectid = inode->location.objectid; + objectid = inode->ref_root_id; } else { WARN_ON(1); fscrypt_free_filename(&fname); @@ -4305,7 +4472,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_release_path(path); } else { ret = btrfs_del_root_ref(trans, objectid, - root->root_key.objectid, dir_ino, + btrfs_root_id(root), dir_ino, &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -4321,8 +4488,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); - dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); - ret = btrfs_update_inode_fallback(trans, root, dir); + inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, dir); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -4355,7 +4522,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.objectid == root->root_key.objectid) { + if (key.objectid == btrfs_root_id(root)) { ret = -EPERM; btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", @@ 
-4365,7 +4532,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_release_path(path); } - key.objectid = root->root_key.objectid; + key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; @@ -4385,8 +4552,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == root->root_key.objectid && - key.type == BTRFS_ROOT_REF_KEY) + if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } out: @@ -4398,70 +4564,32 @@ out: static void btrfs_prune_dentries(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - u64 objectid = 0; + struct btrfs_inode *inode; + u64 min_ino = 0; if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = btrfs_find_first_inode(root, min_ino); + while (inode) { + if (atomic_read(&inode->vfs_inode.i_count) > 1) + d_prune_aliases(&inode->vfs_inode); - if (objectid < btrfs_ino(entry)) - node = node->rb_left; - else if (objectid > btrfs_ino(entry)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(entry)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(entry) + 1; - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - if (atomic_read(&inode->i_count) > 1) - d_prune_aliases(inode); - /* - * btrfs_drop_inode will have it removed from the inode - * cache when its usage count hits zero. - */ - iput(inode); - cond_resched(); - spin_lock(&root->inode_lock); - goto again; - } - - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); + min_ino = btrfs_ino(inode) + 1; + /* + * btrfs_drop_inode() will have it removed from the inode + * cache when its usage count hits zero. 
+ */ + iput(&inode->vfs_inode); + cond_resched(); + inode = btrfs_find_first_inode(root, min_ino); } - spin_unlock(&root->inode_lock); } int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4482,7 +4610,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", - dest->root_key.objectid); + btrfs_root_id(dest)); ret = -EPERM; goto out_up_write; } @@ -4490,7 +4618,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_up_write; } @@ -4519,11 +4647,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = PTR_ERR(trans); goto out_release; } - ret = btrfs_record_root_in_trans(trans, root); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_end_trans; - } btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); qgroup_reserved = 0; trans->block_rsv = &block_rsv; @@ -4551,7 +4674,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4559,8 +4682,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) } ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - dest->root_key.objectid); + BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4569,7 +4691,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4610,9 +4732,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - int err = 0; + int ret = 0; struct btrfs_trans_handle *trans; - u64 last_unlink_trans; struct fscrypt_name fname; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) @@ -4626,59 +4747,61 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } - err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); - if (err) - return err; + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_notrans; } + /* + * Propagate the last_unlink_trans value of the deleted dir to its + * parent directory. 
This is to prevent an unrecoverable log tree in the + * case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + * + * This is because we can't unlink other roots when replaying the dir + * deletes for directory foo. + */ + if (BTRFS_I(inode)->last_unlink_trans >= trans->transid) + btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } - err = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) goto out; - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; - /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); - if (!err) { + if (!ret) btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * Propagate the last_unlink_trans value of the deleted dir to - * its parent directory. This is to prevent an unrecoverable - * log tree in the case we do something like this: - * 1) create dir foo - * 2) create snapshot under dir foo - * 3) delete the snapshot - * 4) rmdir foo - * 5) mkdir foo - * 6) fsync foo or some file inside foo - */ - if (last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; - } out: btrfs_end_transaction(trans); out_notrans: btrfs_btree_balance_dirty(fs_info); fscrypt_free_filename(&fname); - return err; + return ret; } /* - * btrfs_truncate_block - read, zero a chunk and write a block + * Read, zero a chunk and write a block. + * * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the @@ -4701,7 +4824,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); - struct page *page; + struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; @@ -4733,24 +4856,28 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } again: - page = find_or_create_page(mapping, index, mask); - if (!page) { - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); ret = -ENOMEM; goto out; } - if (!PageUptodate(page)) { - ret = btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto out_unlock; } @@ -4759,22 +4886,22 @@ again: /* * We unlock the page after the io is completed and then re-lock it * above. 
release_folio() could have come in between that and cleared - * PagePrivate(), but left the page in the mapping. Set the page mapped + * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. */ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent(io_tree, block_start, block_end, &cached_state); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -4795,15 +4922,17 @@ again: if (!len) len = blocksize - offset; if (front) - memzero_page(page, (block_start - page_offset(page)), - offset); + folio_zero_range(folio, block_start - folio_pos(folio), + offset); else - memzero_page(page, (block_start - page_offset(page)) + offset, - len); - } - btrfs_page_clear_checked(fs_info, page, block_start, - block_end + 1 - block_start); - btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); + folio_zero_range(folio, + (block_start - folio_pos(folio)) + offset, + len); + } + btrfs_folio_clear_checked(fs_info, folio, block_start, + block_end + 1 - block_start); + btrfs_folio_set_dirty(fs_info, folio, block_start, + block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -4819,8 +4948,8 @@ out_unlock: block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); @@ -4828,9 +4957,9 @@ out: return ret; } -static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, - u64 offset, u64 len) +static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_drop_extents_args drop_args = { 0 }; @@ -4870,7 +4999,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); return ret; @@ -4894,16 +5023,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err = 0; + int ret = 0; /* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data. 
*/ - err = btrfs_truncate_block(inode, oldsize, 0, 0); - if (err) - return err; + ret = btrfs_truncate_block(inode, oldsize, 0, 0); + if (ret) + return ret; if (size <= hole_start) return 0; @@ -4912,10 +5041,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - block_end - cur_offset); + em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { - err = PTR_ERR(em); + ret = PTR_ERR(em); em = NULL; break; } @@ -4923,17 +5051,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; - err = maybe_insert_hole(root, inode, cur_offset, - hole_size); - if (err) + ret = maybe_insert_hole(inode, cur_offset, hole_size); + if (ret) break; - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; hole_em = alloc_extent_map(); @@ -4946,21 +5073,18 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) } hole_em->start = cur_offset; hole_em->len = hole_size; - hole_em->orig_start = cur_offset; - hole_em->block_start = EXTENT_MAP_HOLE; - hole_em->block_len = 0; - hole_em->orig_block_len = 0; + hole_em->disk_bytenr = EXTENT_MAP_HOLE; + hole_em->disk_num_bytes = 0; hole_em->ram_bytes = hole_size; - hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = fs_info->generation; + hole_em->generation = btrfs_get_fs_generation(fs_info); - err = btrfs_replace_extent_map_range(inode, hole_em, true); + ret = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; } next: @@ -4972,7 +5096,7 @@ next: } free_extent_map(em); unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); - return err; + return ret; } static int btrfs_setsize(struct inode *inode, struct iattr *attr) @@ -4993,7 +5117,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); } } @@ -5021,14 +5146,14 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { - ret = btrfs_wait_ordered_range(inode, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), ALIGN(newsize, fs_info->sectorsize), (u64)-1); if (ret) @@ -5058,7 +5183,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. 
*/ - err = btrfs_wait_ordered_range(inode, 0, (u64)-1); + err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (err) return err; i_size_write(inode, BTRFS_I(inode)->disk_i_size); @@ -5228,7 +5353,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, void btrfs_evict_inode(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv = NULL; @@ -5242,11 +5367,12 @@ void btrfs_evict_inode(struct inode *inode) return; } + fs_info = inode_to_fs_info(inode); evict_inode_truncate_pages(inode); if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) goto out; @@ -5258,7 +5384,7 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID); goto out; } @@ -5430,7 +5556,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = dir->root->root_key.objectid; + key.objectid = btrfs_root_id(dir->root); key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5471,59 +5597,52 @@ out: return err; } -static void inode_tree_add(struct btrfs_inode *inode) +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) { struct btrfs_root *root = inode->root; - struct btrfs_inode *entry; - struct rb_node **p; - struct rb_node *parent; - struct rb_node *new = &inode->rb_node; - u64 ino = btrfs_ino(inode); + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; if (inode_unhashed(&inode->vfs_inode)) - return; - parent = NULL; - spin_lock(&root->inode_lock); - p = &root->inode_tree.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_inode, rb_node); + return 0; - if (ino < btrfs_ino(entry)) - p = &parent->rb_left; - else if (ino > btrfs_ino(entry)) - p = &parent->rb_right; - else { - WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING))); - rb_replace_node(parent, new, &root->inode_tree); - RB_CLEAR_NODE(parent); - spin_unlock(&root->inode_lock); - return; - } + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); } - rb_link_node(new, parent, p); - rb_insert_color(new, &root->inode_tree); - spin_unlock(&root->inode_lock); + + return 0; } -static void inode_tree_del(struct btrfs_inode *inode) +static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; - int empty = 0; + struct btrfs_inode *entry; + bool empty = false; - spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&inode->rb_node)) { - rb_erase(&inode->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&inode->rb_node); - empty = RB_EMPTY_ROOT(&root->inode_tree); - } - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + if (entry == inode) + empty = xa_empty(&root->inodes); + 
xa_unlock(&root->inodes); if (empty && btrfs_root_refs(&root->root_item) == 0) { - spin_lock(&root->inode_lock); - empty = RB_EMPTY_ROOT(&root->inode_tree); - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty) btrfs_add_dead_root(root); } @@ -5534,12 +5653,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - BTRFS_I(inode)->location.objectid = args->ino; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; + btrfs_set_inode_number(BTRFS_I(inode), args->ino); BTRFS_I(inode)->root = btrfs_grab_root(args->root); - BUG_ON(args->root && !BTRFS_I(inode)->root); if (args->root && args->root == args->root->fs_info->tree_root && args->ino != BTRFS_BTREE_INODE_OBJECTID) @@ -5552,12 +5667,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == BTRFS_I(inode)->location.objectid && + return args->ino == btrfs_ino(BTRFS_I(inode)) && args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, - struct btrfs_root *root) +static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5566,7 +5680,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, args.ino = ino; args.root = root; - inode = iget5_locked(s, hashval, btrfs_find_actor, + inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; @@ -5578,57 +5692,62 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, * allocator. NULL is also valid but may require an additional allocation * later. */ -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path) +struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { struct inode *inode; + int ret; - inode = btrfs_iget_locked(s, ino, root); + inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - int ret; + if (!(inode->i_state & I_NEW)) + return inode; - ret = btrfs_read_locked_inode(inode, path); - if (!ret) { - inode_tree_add(BTRFS_I(inode)); - unlock_new_inode(inode); - } else { - iget_failed(inode); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode, this means the inode item - * was not found. - */ - if (ret > 0) - ret = -ENOENT; - inode = ERR_PTR(ret); - } - } + ret = btrfs_read_locked_inode(inode, path); + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_read_locked_inode(), this means the inode item was not found. 
+ */ + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto error; + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret < 0) + goto error; + + unlock_new_inode(inode); return inode; +error: + iget_failed(inode); + return ERR_PTR(ret); } -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) +struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(s, ino, root, NULL); + return btrfs_iget_path(ino, root, NULL); } static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { + struct timespec64 ts; struct inode *inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); BTRFS_I(inode)->root = btrfs_grab_root(root); - memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); - inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry @@ -5637,9 +5756,13 @@ static struct inode *new_simple_dir(struct inode *dir, inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = inode_set_ctime_current(inode); - inode->i_atime = dir->i_atime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + + ts = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, ts); + inode_set_atime_to_ts(inode, inode_get_atime(dir)); + BTRFS_I(inode)->i_otime_sec = ts.tv_sec; + BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + inode->i_uid = dir->i_uid; inode->i_gid = dir->i_gid; @@ -5662,7 +5785,7 @@ static inline u8 btrfs_inode_type(struct inode *inode) struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; @@ -5678,7 +5801,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, location.objectid, root); + inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) return inode; @@ -5702,7 +5825,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir, &location, root); } else { - inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); + inode = btrfs_iget(location.objectid, sub_root); btrfs_put_root(sub_root); if (IS_ERR(inode)) @@ -5922,7 +6045,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - put = btrfs_readdir_get_delayed_items(inode, private->last_index, + put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index, &ins_list, &del_list); again: @@ -6012,7 +6135,7 @@ nopos: ret = 0; err: if (put) - btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); + btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -6037,15 +6160,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); - if (ret && (ret == -ENOSPC || ret == 
-EDQUOT)) { + ret = btrfs_update_inode(trans, inode); + if (ret == -ENOSPC || ret == -EDQUOT) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); if (inode->delayed_node) @@ -6061,7 +6184,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; - bool dirty = flags & ~S_VERSION; + bool dirty; if (btrfs_root_readonly(root)) return -EROFS; @@ -6097,7 +6220,7 @@ static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; - args.ino = BTRFS_I(inode)->location.objectid; + args.ino = btrfs_ino(BTRFS_I(inode)); args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, @@ -6197,13 +6320,13 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { + struct timespec64 ts; struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; - struct btrfs_key *location; struct btrfs_path *path; u64 objectid; struct btrfs_inode_ref *ref; @@ -6212,6 +6335,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_item_batch batch; unsigned long ptr; int ret; + bool xa_reserved = false; path = btrfs_alloc_path(); if (!path) @@ -6221,10 +6345,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); root = BTRFS_I(inode)->root; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + goto out; + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; - inode->i_ino = objectid; + btrfs_set_inode_number(BTRFS_I(inode), objectid); + + ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); + if (ret) + goto out; + xa_reserved = true; if (args->orphan) { /* @@ -6239,12 +6372,21 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) goto out; } - /* index_cnt is ignored for everything but a dir. */ - BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; /* + * We don't have any capability xattrs set here yet, shortcut any + * queries for the xattrs here. If we add them later via the inode + * security init path or any other path this flag will be cleared. + */ + set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + + /* * Subvolumes don't inherit flags from their parent directory. * Originally this was probably by accident, but we probably can't * change it now without compatibility issues. 
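In btrfs_create_new_inode() above, the slot for the new inode number is reserved in the root's inodes xarray up front, via xa_reserve() in a context that may still sleep and allocate; the later insertion through btrfs_add_inode_to_root() then cannot fail for lack of memory (hence the WARN_ON() around it further below), and xa_release() drops the reservation on the error paths. A rough, self-contained sketch of that reserve-then-store discipline, using a hypothetical flat slot_table in place of the kernel xarray:

#include <stdlib.h>
#include <string.h>

struct slot_table {
        void **slots;
        size_t capacity;
};

/*
 * The "may sleep and allocate" phase, analogous to xa_reserve(): make
 * sure index 'idx' exists so a later store there cannot hit ENOMEM.
 */
static int table_reserve(struct slot_table *t, size_t idx)
{
        size_t new_cap = idx + 1;
        void **bigger;

        if (idx < t->capacity)
                return 0;
        bigger = realloc(t->slots, new_cap * sizeof(*bigger));
        if (!bigger)
                return -1;
        memset(bigger + t->capacity, 0,
               (new_cap - t->capacity) * sizeof(*bigger));
        t->slots = bigger;
        t->capacity = new_cap;
        return 0;
}

/*
 * The "must not fail" phase, analogous to storing into a reserved slot:
 * no allocation happens here, so there is no error to handle.
 */
static void *table_store(struct slot_table *t, size_t idx, void *item)
{
        void *old = t->slots[idx];      /* idx must have been reserved */

        t->slots[idx] = item;
        return old;
}

/*
 * Error-path counterpart of xa_release(). A flat table needs no cleanup
 * for an unused reservation; the real xarray returns preallocated nodes.
 */
static void table_release(struct slot_table *t, size_t idx)
{
        (void)t;
        (void)idx;
}

int main(void)
{
        struct slot_table t = { NULL, 0 };
        int inode = 42;

        if (table_reserve(&t, 257) != 0)
                return 1;               /* easy to back out this early */
        if (0) {                        /* imagine the transaction aborting */
                table_release(&t, 257);
                free(t.slots);
                return 1;
        }
        table_store(&t, 257, &inode);   /* cannot fail any more */
        free(t.slots);
        return 0;
}

The point of splitting the two phases is the same as in the diff: once inode creation has progressed past the last point where it can be unwound cheaply, adding the inode to the root's index must not be able to fail.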
@@ -6260,11 +6402,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - ret = btrfs_insert_inode_locked(inode); if (ret < 0) { if (!args->orphan) @@ -6314,9 +6451,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - inode->i_mtime = inode_set_ctime_current(inode); - inode->i_atime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + ts = simple_inode_init_ts(inode); + BTRFS_I(inode)->i_otime_sec = ts.tv_sec; + BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; /* * We're going to fill the inode item now, so at this point the inode @@ -6363,8 +6500,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * Subvolumes inherit properties from their parent subvolume, * not the directory they were created in. */ - parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - BTRFS_I(dir)->root); + parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { @@ -6377,8 +6513,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) { btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, - ret); + btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } /* @@ -6393,7 +6528,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(BTRFS_I(inode)); + ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); + if (WARN_ON(ret)) { + /* Shouldn't happen, we used xa_reserve() before. */ + btrfs_abort_transaction(trans, ret); + goto discard; + } trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); @@ -6421,6 +6561,9 @@ discard: ihold(inode); discard_new_inode(inode); out: + if (xa_reserved) + xa_release(&root->inodes, objectid); + btrfs_free_path(path); return ret; } @@ -6451,7 +6594,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, index, name); } else if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, name, @@ -6481,10 +6624,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, * values (the ones it had when the fsync was done). 
*/ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) - parent_inode->vfs_inode.i_mtime = - inode_set_ctime_current(&parent_inode->vfs_inode); + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); - ret = btrfs_update_inode(trans, root, parent_inode); + ret = btrfs_update_inode(trans, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6494,7 +6637,7 @@ fail_dir_item: u64 local_index; int err; err = btrfs_del_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, &local_index, name); if (err) btrfs_abort_transaction(trans, err); @@ -6515,7 +6658,7 @@ fail_dir_item: static int btrfs_create_common(struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_new_inode_args new_inode_args = { .dir = dir, @@ -6585,14 +6728,14 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; int err; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) + if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) @@ -6635,7 +6778,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); + err = btrfs_update_inode(trans, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { @@ -6678,7 +6821,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, } static noinline int uncompress_inline(struct btrfs_path *path, - struct page *page, + struct folio *folio, struct btrfs_file_extent_item *item) { int ret; @@ -6700,7 +6843,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, + max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6711,36 +6855,36 @@ static noinline int uncompress_inline(struct btrfs_path *path, */ if (max_size < PAGE_SIZE) - memzero_page(page, max_size, PAGE_SIZE - max_size); + folio_zero_range(folio, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct page *page) + struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; - if (!page || PageUptodate(page)) + if (!folio || folio_test_uptodate(folio)) return 0; - ASSERT(page_offset(page) == 0); + ASSERT(folio_pos(folio) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page, fi); + return uncompress_inline(path, folio, fi); copy_size = min_t(u64, PAGE_SIZE, 
btrfs_file_extent_ram_bytes(path->nodes[0], fi)); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) - memzero_page(page, copy_size, PAGE_SIZE - copy_size); + folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); return 0; } @@ -6749,7 +6893,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * * @inode: file to search in * @page: page to read extent data into if the extent is inline - * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * @@ -6763,8 +6906,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) + struct folio *folio, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6787,7 +6929,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); - else if (em->block_start == EXTENT_MAP_INLINE && page) + else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) free_extent_map(em); else goto out; @@ -6798,9 +6940,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, goto out; } em->start = EXTENT_MAP_HOLE; - em->orig_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; em->len = (u64)-1; - em->block_len = (u64)-1; path = btrfs_alloc_path(); if (!path) { @@ -6890,9 +7031,8 @@ next: /* New extent overlaps with existing one */ em->start = start; - em->orig_start = start; em->len = found_key.offset - start; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; goto insert; } @@ -6907,7 +7047,6 @@ next: * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ - ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); @@ -6917,19 +7056,18 @@ next: * * Other members are not utilized for inline extents. 
*/ - ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page); + ret = read_inline_extent(inode, path, folio); if (ret < 0) goto out; goto insert; } not_found: em->start = start; - em->orig_start = start; em->len = len; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; insert: ret = 0; btrfs_release_path(path); @@ -6942,7 +7080,7 @@ insert: } write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); @@ -6956,84 +7094,6 @@ out: return em; } -static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - const u64 start, - const u64 len, - const u64 orig_start, - const u64 block_start, - const u64 block_len, - const u64 orig_block_len, - const u64 ram_bytes, - const int type) -{ - struct extent_map *em = NULL; - struct btrfs_ordered_extent *ordered; - - if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, block_start, - block_len, orig_block_len, ram_bytes, - BTRFS_COMPRESS_NONE, /* compress_type */ - type); - if (IS_ERR(em)) - goto out; - } - ordered = btrfs_alloc_ordered_extent(inode, start, len, len, - block_start, block_len, 0, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT), - BTRFS_COMPRESS_NONE); - if (IS_ERR(ordered)) { - if (em) { - free_extent_map(em); - btrfs_drop_extent_map_range(inode, start, - start + len - 1, false); - } - em = ERR_CAST(ordered); - } else { - ASSERT(!dio_data->ordered); - dio_data->ordered = ordered; - } - out: - - return em; -} - -static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 len) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_map *em; - struct btrfs_key ins; - u64 alloc_hint; - int ret; - - alloc_hint = get_extent_allocation_hint(inode, start, len); -again: - ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, - 0, alloc_hint, &ins, 1, 1); - if (ret == -EAGAIN) { - ASSERT(btrfs_is_zoned(fs_info)); - wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - goto again; - } - if (ret) - return ERR_PTR(ret); - - em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, - ins.objectid, ins.offset, ins.offset, - ins.offset, BTRFS_ORDERED_REGULAR); - btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); - - return em; -} - static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; @@ -7068,10 +7128,10 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * any ordered extents. 
*/ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict) + struct btrfs_file_extent *file_extent, + bool nowait, bool strict) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; @@ -7119,8 +7179,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); - if (ram_bytes) - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; @@ -7138,162 +7196,93 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } ret = 0; - if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) + if (btrfs_extent_readonly(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset)) goto out; if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + nocow_args.num_bytes, + range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit(io_tree, offset, range_end, - EXTENT_DELALLOC, 0, NULL); + ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { ret = -EAGAIN; goto out; } } - if (orig_start) - *orig_start = key.offset - nocow_args.extent_offset; - if (orig_block_len) - *orig_block_len = nocow_args.disk_num_bytes; + if (file_extent) + memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); - *len = nocow_args.num_bytes; + *len = nocow_args.file_extent.num_bytes; ret = 1; out: btrfs_free_path(path); return ret; } -static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, - unsigned int iomap_flags) -{ - const bool writing = (iomap_flags & IOMAP_WRITE); - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - int ret = 0; - - while (1) { - if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; - } else { - lock_extent(io_tree, lockstart, lockend, cached_state); - } - /* - * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure there's no ordered - * extents in this range. - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, - lockend - lockstart + 1); - - /* - * We need to make sure there are no buffered pages in this - * range either, we could have raced between the invalidate in - * generic_file_direct_write and locking the extent. The - * invalidate needs to happen so that reads after a write do not - * get stale data. - */ - if (!ordered && - (!writing || !filemap_range_has_page(inode->i_mapping, - lockstart, lockend))) - break; - - unlock_extent(io_tree, lockstart, lockend, cached_state); - - if (ordered) { - if (nowait) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - break; - } - /* - * If we are doing a DIO read and the ordered extent we - * found is for a buffered write, we can not wait for it - * to complete and retry, because if we do so we can - * deadlock with concurrent buffered writes on page - * locks. 
This happens only if our DIO read covers more - * than one extent map, if at this point it has already - * created an ordered extent for a previous extent map - * and locked its range in the inode's io tree, and a - * concurrent write against that previous extent map's - * range and this range started (we unlock the ranges - * in the io tree only when the bios complete and - * buffered writes always lock pages before attempting - * to lock range in the io tree). - */ - if (writing || - test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered); - else - ret = nowait ? -EAGAIN : -ENOTBLK; - btrfs_put_ordered_extent(ordered); - } else { - /* - * We could trigger writeback for this range (and wait - * for it to complete) and then invalidate the pages for - * this range (through invalidate_inode_pages2_range()), - * but that can lead us to a deadlock with a concurrent - * call to readahead (a buffered read or a defrag call - * triggered a readahead) on a page lock due to an - * ordered dio extent we created before but did not have - * yet a corresponding bio submitted (whence it cannot - * complete), which makes readahead wait for that - * ordered extent to complete while holding a lock on - * that page. - */ - ret = nowait ? -EAGAIN : -ENOTBLK; - } - - if (ret) - break; - - cond_resched(); - } - - return ret; -} - /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type) +struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, + const struct btrfs_file_extent *file_extent, + int type) { struct extent_map *em; int ret; + /* + * Note the missing NOCOW type. + * + * For pure NOCOW writes, we should not create an io extent map, but + * just reuse the existing one. + * Only PREALLOC writes (NOCOW write into preallocated range) can + * create an io extent map. + */ ASSERT(type == BTRFS_ORDERED_PREALLOC || type == BTRFS_ORDERED_COMPRESSED || - type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); + switch (type) { + case BTRFS_ORDERED_PREALLOC: + /* We're only referring to part of a larger preallocated extent. */ + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); + break; + case BTRFS_ORDERED_REGULAR: + /* COW results in a new extent matching our file extent size. */ + ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes); + ASSERT(file_extent->ram_bytes == file_extent->num_bytes); + + /* Since it's a new extent, we should not have any offset. */ + ASSERT(file_extent->offset == 0); + break; + case BTRFS_ORDERED_COMPRESSED: + /* Must be compressed. */ + ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE); + + /* + * Encoded write can make us refer to part of the + * uncompressed extent. 
+ */ + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); + break; + } + em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); em->start = start; - em->orig_start = orig_start; - em->len = len; - em->block_len = block_len; - em->block_start = block_start; - em->orig_block_len = orig_block_len; - em->ram_bytes = ram_bytes; + em->len = file_extent->num_bytes; + em->disk_bytenr = file_extent->disk_bytenr; + em->disk_num_bytes = file_extent->disk_num_bytes; + em->ram_bytes = file_extent->ram_bytes; em->generation = -1; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - if (type == BTRFS_ORDERED_PREALLOC) { - set_bit(EXTENT_FLAG_FILLING, &em->flags); - } else if (type == BTRFS_ORDERED_COMPRESSED) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } + em->offset = file_extent->offset; + em->flags |= EXTENT_FLAG_PINNED; + if (type == BTRFS_ORDERED_COMPRESSED) + extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { @@ -7305,591 +7294,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, return em; } - -static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 *lenp, - unsigned int iomap_flags) -{ - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map *em = *map; - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - struct btrfs_block_group *bg; - bool can_nocow = false; - bool space_reserved = false; - u64 len = *lenp; - u64 prev_len; - int ret = 0; - - /* - * We don't allocate a new extent in the following cases - * - * 1) The inode is marked as NODATACOW. In this case we'll just use the - * existing extent. - * 2) The extent is marked as PREALLOC. We're good to go here and can - * just use the extent. - * - */ - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || - ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - em->block_start != EXTENT_MAP_HOLE)) { - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - type = BTRFS_ORDERED_PREALLOC; - else - type = BTRFS_ORDERED_NOCOW; - len = min(len, em->len - (start - em->start)); - block_start = em->block_start + (start - em->start); - - if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false, false) == 1) { - bg = btrfs_inc_nocow_writers(fs_info, block_start); - if (bg) - can_nocow = true; - } - } - - prev_len = len; - if (can_nocow) { - struct extent_map *em2; - - /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - nowait); - if (ret < 0) { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - btrfs_dec_nocow_writers(bg); - if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) - ret = -EAGAIN; - goto out; - } - space_reserved = true; - - em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); - btrfs_dec_nocow_writers(bg); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em2; - em = em2; - } - - if (IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - - dio_data->nocow_done = true; - } else { - /* Our caller expects us to free the input extent map. 
*/ - free_extent_map(em); - *map = NULL; - - if (nowait) { - ret = -EAGAIN; - goto out; - } - - /* - * If we could not allocate data space before locking the file - * range and we can't do a NOCOW write, then we have to fail. - */ - if (!dio_data->data_space_reserved) { - ret = -ENOSPC; - goto out; - } - - /* - * We have to COW and we have already reserved data space before, - * so now we reserve only metadata. - */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - false); - if (ret < 0) - goto out; - space_reserved = true; - - em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - *map = em; - len = min(len, em->len - (start - em->start)); - if (len < prev_len) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - prev_len - len, true); - } - - /* - * We have created our ordered extent, so we can now release our reservation - * for an outstanding extent. - */ - btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); - - /* - * Need to update the i_size under the extent lock so buffered - * readers will get the updated i_size when we unlock. - */ - if (start + len > i_size_read(inode)) - i_size_write(inode, start + len); -out: - if (ret && space_reserved) { - btrfs_delalloc_release_extents(BTRFS_I(inode), len); - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } - *lenp = len; - return ret; -} - -static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - loff_t length, unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map *em; - struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = iter->private; - u64 lockstart, lockend; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - u64 len = length; - const u64 data_alloc_len = length; - bool unlock_extents = false; - - /* - * We could potentially fault if we have a buffer > PAGE_SIZE, and if - * we're NOWAIT we may submit a bio for a partial range and return - * EIOCBQUEUED, which would result in an errant short read. - * - * The best way to handle this would be to allow for partial completions - * of iocb's, so we could submit the partial bio, return and fault in - * the rest of the pages, and then submit the io for the rest of the - * range. However we don't have that currently, so simply return - * -EAGAIN at this point so that the normal path is used. - */ - if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) - return -EAGAIN; - - /* - * Cap the size of reads to that usually seen in buffered I/O as we need - * to allocate a contiguous array for the checksums. - */ - if (!write) - len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); - - lockstart = start; - lockend = start + len - 1; - - /* - * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't - * enough if we've written compressed pages to this area, so we need to - * flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk - the first flush only starts - * compression on the data, while keeping the pages locked, so by the - * time the second flush returns we know bios for the compressed pages - * were submitted and finished, and the pages no longer under writeback. 
- * - * If we have a NOWAIT request and we have any pages in the range that - * are locked, likely due to compression still in progress, we don't want - * to block on page locks. We also don't want to block on pages marked as - * dirty or under writeback (same as for the non-compression case). - * iomap_dio_rw() did the same check, but after that and before we got - * here, mmap'ed writes may have happened or buffered reads started - * (readpage() and readahead(), which lock pages), as we haven't locked - * the file range yet. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) { - if (flags & IOMAP_NOWAIT) { - if (filemap_range_needs_writeback(inode->i_mapping, - lockstart, lockend)) - return -EAGAIN; - } else { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; - } - } - - memset(dio_data, 0, sizeof(*dio_data)); - - /* - * We always try to allocate data space and must do it before locking - * the file range, to avoid deadlocks with concurrent writes to the same - * range if the range has several extents and the writes don't expand the - * current i_size (the inode lock is taken in shared mode). If we fail to - * allocate data space here we continue and later, after locking the - * file range, we fail with ENOSPC only if we figure out we can not do a - * NOCOW write. - */ - if (write && !(flags & IOMAP_NOWAIT)) { - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, data_alloc_len, false); - if (!ret) - dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) - goto err; - } - - /* - * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered IO, or we are doing a - * NOWAIT read/write and we need to block. - */ - ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); - if (ret < 0) - goto err; - - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } - - /* - * Ok for INLINE and COMPRESSED extents we need to fallback on buffered - * io. INLINE is special, and we could probably kludge it in here, but - * it's still buffered so for safety lets just fall back to the generic - * buffered path. - * - * For COMPRESSED we _have_ to read the entire extent in so we can - * decompress it, so there will be buffering required no matter what we - * do, so go ahead and fallback to buffered. - * - * We return -ENOTBLK because that's what makes DIO go ahead and go back - * to buffered IO. Don't blame me, this is the price we pay for using - * the generic code. - */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || - em->block_start == EXTENT_MAP_INLINE) { - free_extent_map(em); - /* - * If we are in a NOWAIT context, return -EAGAIN in order to - * fallback to buffered IO. This is not only because we can - * block with buffered IO (no support for NOWAIT semantics at - * the moment) but also to avoid returning short reads to user - * space - this happens if we were able to read some data from - * previous non-compressed extents and then when we fallback to - * buffered IO, at btrfs_file_read_iter() by calling - * filemap_read(), we fail to fault in pages for the read buffer, - * in which case filemap_read() returns a short read (the number - * of bytes previously read is > 0, so it does not return -EFAULT). - */ - ret = (flags & IOMAP_NOWAIT) ? 
-EAGAIN : -ENOTBLK; - goto unlock_err; - } - - len = min(len, em->len - (start - em->start)); - - /* - * If we have a NOWAIT request and the range contains multiple extents - * (or a mix of extents and holes), then we return -EAGAIN to make the - * caller fall back to a context where it can do a blocking (without - * NOWAIT) request. This way we avoid doing partial IO and returning - * success to the caller, which is not optimal for writes and for reads - * it can result in unexpected behaviour for an application. - * - * When doing a read, because we use IOMAP_DIO_PARTIAL when calling - * iomap_dio_rw(), we can end up returning less data than what the caller - * asked for, resulting in an unexpected, and incorrect, short read. - * That is, the caller asked to read N bytes and we return less than that, - * which is wrong unless we are crossing EOF. This happens if we get a - * page fault error when trying to fault in pages for the buffer that is - * associated to the struct iov_iter passed to iomap_dio_rw(), and we - * have previously submitted bios for other extents in the range, in - * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of - * those bios have completed by the time we get the page fault error, - * which we return to our caller - we should only return EIOCBQUEUED - * after we have submitted bios for all the extents in the range. - */ - if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); - ret = -EAGAIN; - goto unlock_err; - } - - if (write) { - ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, &len, flags); - if (ret < 0) - goto unlock_err; - unlock_extents = true; - /* Recalc len in case the new em is smaller than requested */ - len = min(len, em->len - (start - em->start)); - if (dio_data->data_space_reserved) { - u64 release_offset; - u64 release_len = 0; - - if (dio_data->nocow_done) { - release_offset = start; - release_len = data_alloc_len; - } else if (len < data_alloc_len) { - release_offset = start + len; - release_len = data_alloc_len - len; - } - - if (release_len > 0) - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - release_offset, - release_len); - } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; - } - - if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - else - free_extent_state(cached_state); - - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does - * that, since we have locked only the parts we are performing I/O in. 
- */ - if ((em->block_start == EXTENT_MAP_HOLE) || - (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; - } else { - iomap->addr = em->block_start + (start - em->start); - iomap->type = IOMAP_MAPPED; - } - iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_dev->bdev; - iomap->length = len; - free_extent_map(em); - - return 0; - -unlock_err: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); -err: - if (dio_data->data_space_reserved) { - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - start, data_alloc_len); - extent_changeset_free(dio_data->data_reserved); - } - - return ret; -} - -static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - ssize_t written, unsigned int flags, struct iomap *iomap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_dio_data *dio_data = iter->private; - size_t submitted = dio_data->submitted; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - - if (!write && (iomap->type == IOMAP_HOLE)) { - /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - NULL); - return 0; - } - - if (submitted < length) { - pos += submitted; - length -= submitted; - if (write) - btrfs_finish_ordered_extent(dio_data->ordered, NULL, - pos, length, false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); - ret = -ENOTBLK; - } - if (write) { - btrfs_put_ordered_extent(dio_data->ordered); - dio_data->ordered = NULL; - } - - if (write) - extent_changeset_free(dio_data->data_reserved); - return ret; -} - -static void btrfs_dio_end_io(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = - container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_inode *inode = bbio->inode; - struct bio *bio = &bbio->bio; - - if (bio->bi_status) { - btrfs_warn(inode->root->fs_info, - "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", - btrfs_ino(inode), bio->bi_opf, - dip->file_offset, dip->bytes, bio->bi_status); - } - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_finish_ordered_extent(bbio->ordered, NULL, - dip->file_offset, dip->bytes, - !bio->bi_status); - } else { - unlock_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - bbio->bio.bi_private = bbio->private; - iomap_dio_bio_end_io(bio); -} - -static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, - loff_t file_offset) -{ - struct btrfs_bio *bbio = btrfs_bio(bio); - struct btrfs_dio_private *dip = - container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_dio_data *dio_data = iter->private; - - btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, - btrfs_dio_end_io, bio->bi_private); - bbio->inode = BTRFS_I(iter->inode); - bbio->file_offset = file_offset; - - dip->file_offset = file_offset; - dip->bytes = bio->bi_iter.bi_size; - - dio_data->submitted += bio->bi_iter.bi_size; - - /* - * Check if we are doing a partial write. If we are, we need to split - * the ordered extent to match the submitted bio. Hang on to the - * remaining unfinishable ordered_extent in dio_data so that it can be - * cancelled in iomap_end to avoid a deadlock wherein faulting the - * remaining pages is blocked on the outstanding ordered extent. 
- */ - if (iter->flags & IOMAP_WRITE) { - int ret; - - ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); - if (ret) { - btrfs_finish_ordered_extent(dio_data->ordered, NULL, - file_offset, dip->bytes, - !ret); - bio->bi_status = errno_to_blk_status(ret); - iomap_dio_bio_end_io(bio); - return; - } - } - - btrfs_submit_bio(bbio, 0); -} - -static const struct iomap_ops btrfs_dio_iomap_ops = { - .iomap_begin = btrfs_dio_iomap_begin, - .iomap_end = btrfs_dio_iomap_end, -}; - -static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_dio_submit_io, - .bio_set = &btrfs_dio_bioset, -}; - -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) -{ - struct btrfs_dio_data data = { 0 }; - - return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); -} - -struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before) -{ - struct btrfs_dio_data data = { 0 }; - - return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); -} - -static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - int ret; - - ret = fiemap_prep(inode, fieinfo, start, &len, 0); - if (ret) - return ret; - - /* - * fiemap_prep() called filemap_write_and_wait() for the whole possible - * file range (0 to LLONG_MAX), but that is not enough if we have - * compression enabled. The first filemap_fdatawrite_range() only kicks - * in the compression of data (in an async thread) and will return - * before the compression is done and writeback is started. A second - * filemap_fdatawrite_range() is needed to wait for the compression to - * complete and writeback to start. We also need to wait for ordered - * extents to complete, because our fiemap implementation uses mainly - * file extent items to list the extents, searching for extent maps - * only for file ranges with holes or prealloc extents to figure out - * if we have delalloc in those ranges. - */ - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { - ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); - if (ret) - return ret; - } - - btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); - - /* - * We did an initial flush to avoid holding the inode's lock while - * triggering writeback and waiting for the completion of IO and ordered - * extents. Now after we locked the inode we do it again, because it's - * possible a new write may have happened in between those two steps. - */ - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { - ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); - if (ret) { - btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); - return ret; - } - } - - ret = extent_fiemap(btrfs_inode, fieinfo, start, len); - btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); - - return ret; -} - -static int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return extent_writepages(mapping, wbc); -} - -static void btrfs_readahead(struct readahead_control *rac) -{ - extent_readahead(rac); -} - /* * For release_folio() and invalidate_folio() we have a race window where * folio_end_writeback() is called but the subpage spinlock is not yet released. @@ -7897,16 +7301,16 @@ static void btrfs_readahead(struct readahead_control *rac) * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. 
*/ -static void wait_subpage_spinlock(struct page *page) +static void wait_subpage_spinlock(struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + subpage = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, @@ -7923,15 +7327,20 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } -static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +static int btrfs_launder_folio(struct folio *folio) { - int ret = try_release_extent_mapping(&folio->page, gfp_flags); + return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), + PAGE_SIZE, NULL); +} - if (ret == 1) { - wait_subpage_spinlock(&folio->page); - clear_page_extent_mapped(&folio->page); +static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) +{ + if (try_release_extent_mapping(folio, gfp_flags)) { + wait_subpage_spinlock(folio); + clear_folio_extent_mapped(folio); + return true; } - return ret; + return false; } static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -7965,7 +7374,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, static void btrfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_inode *inode = folio_to_inode(folio); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; @@ -7988,7 +7397,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * do double ordered extent accounting on the same folio. */ folio_wait_writeback(folio); - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); /* * For subpage case, we have call sites like @@ -8044,7 +7453,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { + if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. @@ -8053,7 +7462,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, */ goto next; } - btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); + btrfs_folio_clear_ordered(fs_info, folio, cur, range_len); /* * IO on this page will never be started, so we need to account @@ -8069,11 +7478,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - spin_lock_irq(&inode->ordered_tree.lock); + spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); + spin_unlock_irq(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all @@ -8123,174 +7532,10 @@ next: * did something wrong. 
*/ ASSERT(!folio_test_ordered(folio)); - btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); + btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); - clear_page_extent_mapped(&folio->page); -} - -/* - * btrfs_page_mkwrite() is not allowed to change the file size as it gets - * called from a page fault handler when a page is first dirtied. Hence we must - * be careful to check for EOF conditions here. We set the page up correctly - * for a written page which means we get ENOSPC checking when writing into - * holes and correct delalloc and unwritten extent mapping on filesystems that - * support these features. - * - * We are not allowed to take the i_mutex here so we have to play games to - * protect against truncate races as the page could now be beyond EOF. Because - * truncate_setsize() writes the inode size before removing pages, once we have - * the page lock we can determine safely if the page is beyond EOF. If it is not - * beyond EOF, then the page is guaranteed safe against truncation until we - * unlock the page. - */ -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_changeset *data_reserved = NULL; - unsigned long zero_start; - loff_t size; - vm_fault_t ret; - int ret2; - int reserved = 0; - u64 reserved_space; - u64 page_start; - u64 page_end; - u64 end; - - reserved_space = PAGE_SIZE; - - sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - end = page_end; - - /* - * Reserving delalloc space after obtaining the page lock can lead to - * deadlock. For example, if a dirty page is locked by this function - * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepages() function could - * end up waiting indefinitely to get a lock on the page currently - * being processed by btrfs_page_mkwrite() function. - */ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; - goto out_noreserve; - } - - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ -again: - down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); - size = i_size_read(inode); - - if ((page->mapping != inode->i_mapping) || - (page_start >= size)) { - /* page got truncated out from underneath us */ - goto out_unlock; - } - wait_on_page_writeback(page); - - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); - goto out_unlock; - } - - /* - * we can't set the delalloc bits if there are pending ordered - * extents. 
Drop our locks and wait for them to finish - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, - PAGE_SIZE); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } - - if (page->index == ((size - 1) >> PAGE_SHIFT)) { - reserved_space = round_up(size - page_start, - fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { - end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); - } - } - - /* - * page_mkwrite gets called when the page is firstly dirtied after it's - * faulted in, but write(2) could also dirty a page and set delalloc - * bits, thus in this case for space account reason, we still need to - * clear any delalloc bits within this page range since we have to - * reserve data&meta space before lock_page() (see above comments). - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); - - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, - &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; - goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); - else - zero_start = PAGE_SIZE; - - if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); - - btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); - btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); - btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); - - btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - - unlock_extent(io_tree, page_start, page_end, &cached_state); - up_read(&BTRFS_I(inode)->i_mmap_lock); - - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - -out_unlock: - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); -out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); -out_noreserve: - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return ret; + clear_folio_extent_mapped(folio); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) @@ -8310,7 +7555,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(&inode->vfs_inode, + ret = btrfs_wait_ordered_range(inode, inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) @@ -8403,7 +7648,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) break; @@ -8456,7 +7701,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_update_inode(trans, inode); if (ret2 && !ret) ret = ret2; @@ -8527,8 +7772,12 @@ struct inode 
*btrfs_alloc_inode(struct super_block *sb) ei->disk_i_size = 0; ei->flags = 0; ei->ro_flags = 0; + /* + * ->index_cnt will be properly initialized later when creating a new + * inode (btrfs_create_new_inode()) or when reading an existing inode + * from disk (btrfs_read_locked_inode()). + */ ei->csum_bytes = 0; - ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_reflink_trans = 0; @@ -8545,20 +7794,24 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delayed_node = NULL; - ei->i_otime.tv_sec = 0; - ei->i_otime.tv_nsec = 0; + ei->i_otime_sec = 0; + ei->i_otime_nsec = 0; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); + + /* This io tree sets the valid inode. */ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; - extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); + + ei->file_extent_tree = NULL; + mutex_init(&ei->log_mutex); - btrfs_ordered_inode_tree_init(&ei->ordered_tree); + spin_lock_init(&ei->ordered_tree_lock); + ei->ordered_tree = RB_ROOT; + ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); - RB_CLEAR_NODE(&ei->rb_node); init_rwsem(&ei->i_mmap_lock); return inode; @@ -8568,12 +7821,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) void btrfs_test_destroy_inode(struct inode *inode) { btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); + kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif void btrfs_free_inode(struct inode *inode) { + kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -8592,9 +7847,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) if (!S_ISDIR(vfs_inode->i_mode)) { WARN_ON(inode->delalloc_bytes); WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); } - WARN_ON(inode->csum_bytes); - WARN_ON(inode->defrag_bytes); + if (!root || !btrfs_is_data_reloc_root(root)) + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8628,7 +7884,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) } } btrfs_qgroup_check_reserved_leak(inode); - inode_tree_del(inode); + btrfs_del_inode_from_root(inode); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); @@ -8662,7 +7918,6 @@ void __cold btrfs_destroy_cachep(void) * destroy cache. 
*/ rcu_barrier(); - bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); } @@ -8670,20 +7925,12 @@ int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) - goto fail; - - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bbio.bio), - BIOSET_NEED_BVECS)) - goto fail; + return -ENOMEM; return 0; -fail: - btrfs_destroy_cachep(); - return -ENOMEM; } static int btrfs_getattr(struct mnt_idmap *idmap, @@ -8698,8 +7945,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap, u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; - stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; - stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec; if (bi_flags & BTRFS_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (bi_flags & BTRFS_INODE_COMPRESS) @@ -8719,6 +7966,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; + stat->subvol = BTRFS_I(inode)->root->root_key.objectid; + stat->result_mask |= STATX_SUBVOL; + spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; inode_bytes = inode_get_bytes(inode); @@ -8733,7 +7983,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -8749,6 +7999,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8872,6 +8123,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). 
+ */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8887,7 +8163,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -8902,7 +8178,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -8929,30 +8205,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) +out_fail: + if (logs_pinned) { btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); -out_fail: + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8985,7 +8254,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_new_inode_args whiteout_args = { .dir = old_dir, .dentry = old_dentry, @@ -9002,6 +8271,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9136,6 +8406,29 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. 
+ * + * If the rename affects two different directories, we want to + * make sure that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -9147,7 +8440,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9184,7 +8477,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -9200,6 +8493,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9272,7 +8569,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL); return work; } @@ -9427,7 +8724,7 @@ out: static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; @@ -9598,7 +8895,7 @@ free_qgroup: * or we leak qgroup data reservation.
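Both rename paths now share the same shape (btrfs_rename() checks only old_ino, btrfs_rename_exchange() checks both inodes). Condensed to a fragment, with do_rename_steps() and log_new_names() as hypothetical placeholders for the entry removal/insertion and the btrfs_log_new_name() calls shown in the hunks above:

        bool logs_pinned = false;

        if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
            new_ino != BTRFS_FIRST_FREE_OBJECTID) {
                /*
                 * Pin before removing any directory entry so a concurrent
                 * log sync can never capture a half-done rename.
                 */
                btrfs_pin_log_trans(root);
                btrfs_pin_log_trans(dest);
                logs_pinned = true;
        }

        ret = do_rename_steps(trans);           /* hypothetical placeholder */
        if (!ret && logs_pinned)
                log_new_names(trans);           /* hypothetical placeholder */

        /* out_fail: unpin on every exit path, success or error. */
        if (logs_pinned) {
                btrfs_end_log_trans(root);
                btrfs_end_log_trans(dest);
        }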
*/ btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, qgroup_released, + btrfs_root_id(inode->root), qgroup_released, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret); } @@ -9608,7 +8905,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -9673,13 +8970,12 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } em->start = cur_offset; - em->orig_start = cur_offset; em->len = ins.offset; - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; + em->disk_bytenr = ins.objectid; + em->offset = 0; + em->disk_num_bytes = ins.offset; em->ram_bytes = ins.offset; - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->flags |= EXTENT_FLAG_PREALLOC; em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); @@ -9703,7 +8999,7 @@ next: btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -9760,7 +9056,7 @@ static int btrfs_permission(struct mnt_idmap *idmap, static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode; @@ -9823,17 +9119,19 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; u32 len; ASSERT(end + 1 - start <= U32_MAX); len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->vfs_inode.i_mapping, index); - ASSERT(page); /* Pages should be in the extent_io_tree */ + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ - btrfs_page_set_writeback(fs_info, page, start, len); - put_page(page); + /* This is for data, which doesn't yet support larger folio. 
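One detail of the page-to-folio conversion just above that is easy to miss: find_get_page() returned NULL on a missing page, while __filemap_get_folio() returns an ERR_PTR(), hence the ASSERT(!IS_ERR(folio)). The bare lookup pattern, as a fragment:

        struct folio *folio;

        /* No FGP flags and no gfp mask: lookup only, never allocate. */
        folio = __filemap_get_folio(mapping, index, 0, 0);
        if (IS_ERR(folio))
                return PTR_ERR(folio);
        /* ... use the folio ... */
        folio_put(folio);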
*/ + ASSERT(folio_order(folio) == 0); + btrfs_folio_set_writeback(fs_info, folio, start, len); + folio_put(folio); index++; } } @@ -9970,26 +9268,31 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (!atomic_dec_return(&priv->pending)) + if (atomic_dec_and_test(&priv->pending)) wake_up(&priv->wait); bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .pending = ATOMIC_INIT(1), - }; + struct btrfs_encoded_read_private *priv; unsigned long i = 0; struct btrfs_bio *bbio; + int ret; + + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; - init_waitqueue_head(&priv.wait); + init_waitqueue_head(&priv->wait); + atomic_set(&priv->pending, 1); + priv->status = 0; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -9997,11 +9300,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + atomic_inc(&priv->pending); + btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; @@ -10012,22 +9315,22 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + atomic_inc(&priv->pending); + btrfs_submit_bbio(bbio, 0); - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); + if (atomic_dec_return(&priv->pending)) + io_wait_event(priv->wait, !atomic_read(&priv->pending)); /* See btrfs_encoded_read_endio() for ordering. 
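The priv->pending handling above is the usual bias-counted completion pattern. A self-contained sketch with simplified names, not the kernel code verbatim:

#include <linux/atomic.h>
#include <linux/wait.h>

struct read_completion {
        wait_queue_head_t wait;
        atomic_t pending;
};

static void read_endio(struct read_completion *c)
{
        /* The last bio to finish drops the count to zero and wakes the waiter. */
        if (atomic_dec_and_test(&c->pending))
                wake_up(&c->wait);
}

static void submit_and_wait(struct read_completion *c)
{
        init_waitqueue_head(&c->wait);
        atomic_set(&c->pending, 1);     /* bias: the submitter's own reference */

        /* ... atomic_inc(&c->pending) once per submitted bio ... */

        /* Drop the bias; wait only if bios are still in flight. */
        if (atomic_dec_return(&c->pending))
                io_wait_event(c->wait, !atomic_read(&c->pending));
}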
*/ - return blk_status_to_errno(READ_ONCE(priv.status)); + ret = blk_status_to_errno(READ_ONCE(priv->status)); + kfree(priv); + return ret; } -static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, - struct iov_iter *iter, - u64 start, u64 lockend, - struct extent_state **cached_state, - u64 disk_bytenr, u64 disk_io_size, - size_t count, bool compressed, - bool *unlocked) +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; @@ -10041,13 +9344,13 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - ret = btrfs_alloc_page_array(nr_pages, pages); + ret = btrfs_alloc_page_array(nr_pages, pages, false); if (ret) { ret = -ENOMEM; goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_io_size, pages); if (ret) goto out; @@ -10088,15 +9391,16 @@ out: } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded) + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); - u64 start, lockend, disk_bytenr, disk_io_size; - struct extent_state *cached_state = NULL; + u64 start, lockend; struct extent_map *em; bool unlocked = false; @@ -10118,27 +9422,27 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, for (;;) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, + ret = btrfs_wait_ordered_range(inode, start, lockend - start + 1); if (ret) goto out_unlock_inode; - lock_extent(io_tree, start, lockend, &cached_state); + lock_extent(io_tree, start, lockend, cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, &cached_state); + unlock_extent(io_tree, start, lockend, cached_state); cond_resched(); } - em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; } - if (em->block_start == EXTENT_MAP_INLINE) { + if (em->disk_bytenr == EXTENT_MAP_INLINE) { u64 extent_start = em->start; /* @@ -10148,7 +9452,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, - &cached_state, extent_start, + cached_state, extent_start, count, encoded, &unlocked); goto out; } @@ -10159,61 +9463,58 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ encoded->len = min_t(u64, extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - disk_bytenr = EXTENT_MAP_HOLE; + if (em->disk_bytenr == EXTENT_MAP_HOLE || + (em->flags & EXTENT_FLAG_PREALLOC)) { 
+ *disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - disk_bytenr = em->block_start; + } else if (extent_map_is_compressed(em)) { + *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. */ - if (em->block_len > count) { + if (em->disk_num_bytes > count) { ret = -ENOBUFS; goto out_em; } - disk_io_size = em->block_len; - count = em->block_len; + *disk_io_size = em->disk_num_bytes; + count = em->disk_num_bytes; encoded->unencoded_len = em->ram_bytes; - encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, - em->compress_type); + extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - disk_bytenr = em->block_start + (start - em->start); + *disk_bytenr = extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. */ - disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; - count = start + disk_io_size - iocb->ki_pos; + *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + *disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; - disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } free_extent_map(em); em = NULL; - if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, &cached_state); + if (*disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { - ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, - &cached_state, disk_bytenr, - disk_io_size, count, - encoded->compression, - &unlocked); + ret = -EIOCBQUEUED; + goto out_em; } out: @@ -10222,10 +9523,11 @@ out: out_em: free_extent_map(em); out_unlock_extent: - if (!unlocked) - unlock_extent(io_tree, start, lockend, &cached_state); + /* Leave inode and extent locked if we need to do a read. */ + if (!unlocked && ret != -EIOCBQUEUED) + unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: - if (!unlocked) + if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -10240,12 +9542,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; int compression; size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long nr_pages, i; - struct page **pages; + unsigned long nr_folios, i; + struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -10334,24 +9637,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. 
*/ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); - if (!pages) + nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!folios) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_folios; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; - pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pages[i]) { + folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!folios[i]) { ret = -ENOMEM; - goto out_pages; + goto out_folios; } - kaddr = kmap_local_page(pages[i]); + kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; - goto out_pages; + goto out_folios; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); @@ -10361,14 +9664,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, for (;;) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) - goto out_pages; + goto out_folios; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_pages; + goto out_folios; lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && @@ -10396,10 +9699,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_qgroup_free_data; /* Try an inline extent first. */ - if (start == 0 && encoded->unencoded_len == encoded->len && - encoded->unencoded_offset == 0) { - ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, pages, true); + if (encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0 && + can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { + ret = __cow_file_range_inline(inode, start, encoded->len, + orig_count, compression, folios[0], + true); if (ret <= 0) { if (ret == 0) ret = orig_count; @@ -10413,22 +9718,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_delalloc_release; extent_reserved = true; - em = create_io_em(inode, start, num_bytes, - start - encoded->unencoded_offset, ins.objectid, - ins.offset, ins.offset, ram_bytes, compression, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = num_bytes; + file_extent.ram_bytes = ram_bytes; + file_extent.offset = encoded->unencoded_offset; + file_extent.compression = compression; + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserved; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, - ins.objectid, ins.offset, - encoded->unencoded_offset, - (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED), - compression); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + (1U << BTRFS_ORDERED_ENCODED) | + (1U << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -10443,7 +9748,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, 
btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); ret = orig_count; goto out; @@ -10465,12 +9770,12 @@ out_free_data_space: btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: unlock_extent(io_tree, start, end, &cached_state); -out_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) - __free_page(pages[i]); +out_folios: + for (i = 0; i < nr_folios; i++) { + if (folios[i]) + folio_put(folios[i]); } - kvfree(pages); + kvfree(folios); out: if (ret >= 0) iocb->ki_pos += encoded->len; @@ -10617,38 +9922,59 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; + struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and * we don't really care. */ - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. 
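The net lock ordering for swapfile activation after this change, in outline (all names as in the hunk above):

        btrfs_assert_inode_locked(BTRFS_I(inode));      /* taken by the caller */
        down_write(&BTRFS_I(inode)->i_mmap_lock);       /* fence off mmap writes */

        /* ... flush delalloc and run the flag checks ... */
        lock_extent(io_tree, 0, isize - 1, &cached_state);
        /* ... walk and validate the file extents ... */
        unlock_extent(io_tree, 0, isize - 1, &cached_state);

        up_write(&BTRFS_I(inode)->i_mmap_lock);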
*/ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -10663,7 +9989,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -10677,7 +10004,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -10697,8 +10025,9 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", - root->root_key.objectid); - return -EPERM; + btrfs_root_id(root)); + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -10706,24 +10035,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->block_start == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). 
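The search-slot walk that replaces btrfs_get_extent() here relies on a simple invariant: in a hole-free file every extent item starts exactly at the previous item's end, so an exact-key miss (ret > 0) can only mean an implicit NO_HOLES hole. Condensed skeleton of the loop in the hunk above:

        while (prev_extent_end < isize) {
                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.type = BTRFS_EXTENT_DATA_KEY;
                key.offset = prev_extent_end;

                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        goto out;
                if (ret > 0) {
                        /* No item at prev_extent_end: implicit hole, reject. */
                        ret = -EINVAL;
                        goto out;
                }

                /* ... validate the extent item, check sharedness ... */
                prev_extent_end = btrfs_file_extent_end(path);
                btrfs_release_path(path);
        }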
+ */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->block_start == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -10735,36 +10079,58 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = em->block_start + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. 
+ */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; goto out; } - em = btrfs_get_chunk_map(fs_info, logical_block_start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(map)) { + ret = PTR_ERR(map); goto out; } - if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { btrfs_warn(fs_info, "swapfile must have single data profile"); ret = -EINVAL; @@ -10772,23 +10138,22 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, } if (device == NULL) { - device = em->map_lookup->stripes[0].dev; + device = map->stripes[0].dev; ret = btrfs_add_swapfile_pin(inode, device, false); if (ret == 1) ret = 0; else if (ret) goto out; - } else if (device != em->map_lookup->stripes[0].dev) { + } else if (device != map->stripes[0].dev) { btrfs_warn(fs_info, "swapfile must be on one device"); ret = -EINVAL; goto out; } - physical_block_start = (em->map_lookup->stripes[0].physical + - (logical_block_start - em->start)); - len = min(len, em->len - (logical_block_start - em->start)); - free_extent_map(em); - em = NULL; + physical_block_start = (map->stripes[0].physical + + (logical_block_start - map->start)); + btrfs_free_chunk_map(map); + map = NULL; bg = btrfs_lookup_block_group(fs_info, logical_block_start); if (!bg) { @@ -10827,20 +10192,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); + if (!IS_ERR_OR_NULL(map)) + btrfs_free_chunk_map(map); unlock_extent(io_tree, 0, isize - 1, &cached_state); @@ -10851,6 +10221,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; @@ -10921,7 +10295,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en if (ordered) { btrfs_err(root->fs_info, "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", - start, end, btrfs_ino(inode), root->root_key.objectid, + start, end, btrfs_ino(inode), btrfs_root_id(root), ordered->file_offset, ordered->file_offset + ordered->num_bytes - 1); btrfs_put_ordered_extent(ordered); @@ -10930,6 +10304,36 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en ASSERT(ordered == NULL); } +/* + * Find the first inode with a minimum number. + * + * @root: The root to search for. + * @min_ino: The minimum inode number. + * + * Find the first inode in the @root with a number >= @min_ino and return it. + * Returns NULL if no such inode found. 
+ */ +struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + if (igrab(&inode->vfs_inode)) + break; + + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10982,10 +10386,11 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, + .launder_folio = btrfs_launder_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ae6806bc3929..1706f6d9b12e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -34,11 +34,9 @@ #include "export.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "volumes.h" #include "locking.h" #include "backref.h" -#include "rcu-string.h" #include "send.h" #include "dev-replace.h" #include "props.h" @@ -47,9 +45,7 @@ #include "tree-log.h" #include "compression.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" -#include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -231,6 +227,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) +{ + if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + +static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2) +{ + if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + /* * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. 
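A typical caller of the helper above balances the igrab() taken inside it with an iput(); illustrative fragment:

        struct btrfs_inode *inode;
        u64 min_ino = 0;

        while ((inode = btrfs_find_first_inode(root, min_ino))) {
                min_ino = btrfs_ino(inode) + 1;
                /* ... operate on the inode ... */
                iput(&inode->vfs_inode);
        }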
@@ -247,7 +257,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; @@ -365,15 +375,15 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, - strlen(comp), 0); + ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, - 0, 0); + ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -385,7 +395,7 @@ update_flags: btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); @@ -540,7 +550,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, return ret; } -int __pure btrfs_is_empty_uuid(u8 *uuid) +int __pure btrfs_is_empty_uuid(const u8 *uuid) { int i; @@ -582,7 +592,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; @@ -646,22 +656,17 @@ static noinline int create_subvol(struct mnt_idmap *idmap, ret = PTR_ERR(trans); goto out_release_rsv; } - ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); - if (ret) - goto out; btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - /* Tree log can't currently deal with an inode which is a new root. 
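All of the ioctls converted below follow the same shape built on the two name-checking helpers introduced above: reject an unterminated buffer with -ENAMETOOLONG instead of silently truncating it by writing a NUL into the last byte. The shared pattern, as a fragment:

        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);

        /* -ENAMETOOLONG if userspace passed a name without a NUL terminator */
        ret = btrfs_check_ioctl_vol_args_path(vol_args);
        if (ret < 0)
                goto out_free;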
*/ - btrfs_set_log_full_commit(trans); - ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); + ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit); if (ret) goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, - BTRFS_NESTING_NORMAL); + 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto out; @@ -753,6 +758,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); @@ -786,7 +793,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; @@ -846,7 +853,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; - pending_snapshot->dir = dir; + pending_snapshot->dir = BTRFS_I(dir); pending_snapshot->inherit = inherit; trans = btrfs_start_transaction(root, 0); @@ -931,7 +938,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(d_inode(victim->d_parent) != dir); + /* The @victim is not inside @dir. */ + if (d_inode(victim->d_parent) != dir) + return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); @@ -983,7 +992,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; @@ -1060,7 +1069,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, atomic_inc(&root->snapshot_force_cow); snapshot_force_cow = true; - btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); @@ -1118,7 +1127,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 new_size; u64 old_size; u64 devid = 1; @@ -1149,7 +1158,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = PTR_ERR(vol_args); goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + sizestr = vol_args->name; cancel = (strcmp("cancel", sizestr) == 0); ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); @@ -1298,12 +1310,12 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } else { struct fd src = fdget(fd); struct inode *src_inode; - if (!src.file) { + if (!fd_file(src)) { ret = -EINVAL; goto out_drop_write; } - src_inode = file_inode(src.file); + src_inode = file_inode(fd_file(src)); if (src_inode->i_sb != file_inode(file)->i_sb) { 
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); @@ -1349,12 +1361,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); +out: kfree(vol_args); return ret; } @@ -1373,7 +1388,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto free_args; if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; @@ -1383,7 +1400,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { - u64 nums; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE) { @@ -1396,19 +1413,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (inherit->num_qgroups > PAGE_SIZE || - inherit->num_ref_copies > PAGE_SIZE || - inherit->num_excl_copies > PAGE_SIZE) { - ret = -EINVAL; - goto free_inherit; - } - - nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + - 2 * inherit->num_excl_copies; - if (vol_args->size != struct_size(inherit, qgroups, nums)) { - ret = -EINVAL; + ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size); + if (ret < 0) goto free_inherit; - } } ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), @@ -1426,7 +1433,7 @@ free_args: static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; @@ -1449,7 +1456,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; @@ -1502,7 +1509,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, spin_unlock(&root->root_item_lock); btrfs_warn(fs_info, "Attempt to set subvolume %llu read-write during send", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_drop_sem; } @@ -1696,7 +1703,7 @@ static noinline int search_ioctl(struct inode *inode, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *info = inode_to_fs_info(inode); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; @@ -1909,9 +1916,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_ioctl_ino_lookup_user_args *args) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct super_block *sb = inode->i_sb; - struct btrfs_key upper_limit = BTRFS_I(inode)->location; - u64 treeid = BTRFS_I(inode)->root->root_key.objectid; + u64 upper_limit = 
btrfs_ino(BTRFS_I(inode)); + u64 treeid = btrfs_root_id(BTRFS_I(inode)->root); u64 dirid = args->dirid; unsigned long item_off; unsigned long item_len; @@ -1936,7 +1942,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * If the bottom subvolume does not exist directly under upper_limit, * construct the path in from the bottom up. */ - if (dirid != upper_limit.objectid) { + if (dirid != upper_limit) { ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; root = btrfs_get_fs_root(fs_info, treeid, true); @@ -1998,7 +2004,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * btree and lock the same leaf. */ btrfs_release_path(path); - temp_inode = btrfs_iget(sb, key2.objectid, root); + temp_inode = btrfs_iget(key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; @@ -2011,7 +2017,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } - if (key.offset == upper_limit.objectid) + if (key.offset == upper_limit) break; if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { ret = -EACCES; @@ -2083,7 +2089,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) - args->treeid = root->root_key.objectid; + args->treeid = btrfs_root_id(root); if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2132,7 +2138,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) inode = file_inode(file); if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && - BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { /* * The subvolume does not exist under fd with which this is * called @@ -2179,7 +2185,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ - key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.objectid = btrfs_root_id(BTRFS_I(inode)->root); root = btrfs_get_fs_root(fs_info, key.objectid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); @@ -2294,7 +2300,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, return PTR_ERR(rootrefs); } - objectid = root->root_key.objectid; + objectid = btrfs_root_id(root); key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; @@ -2367,9 +2373,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, bool destroy_v2) { struct dentry *parent = file->f_path.dentry; - struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); struct dentry *dentry; struct inode *dir = d_inode(parent); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2378,7 +2384,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; - int err = 0; + int ret = 0; bool destroy_parent = false; /* We don't support snapshots with extent tree v2 yet. 
*/ @@ -2394,7 +2400,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, return PTR_ERR(vol_args2); if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { - err = -EOPNOTSUPP; + ret = -EOPNOTSUPP; goto out; } @@ -2403,29 +2409,31 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * name, same as v1 currently does. */ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { - vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); + if (ret < 0) + goto out; subvol_name = vol_args2->name; - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; } else { struct inode *old_dir; if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; + ret = -EINVAL; goto out; } - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, vol_args2->subvolid, 0); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); + ret = PTR_ERR(dentry); goto out_drop_write; } @@ -2445,7 +2453,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ dput(dentry); if (IS_ERR(parent)) { - err = PTR_ERR(parent); + ret = PTR_ERR(parent); goto out_drop_write; } old_dir = dir; @@ -2469,14 +2477,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * to delete without an idmapped mount. */ if (old_dir != dir && idmap != &nop_mnt_idmap) { - err = -EOPNOTSUPP; + ret = -EOPNOTSUPP; goto free_parent; } subvol_name_ptr = btrfs_get_subvol_name_from_objectid( fs_info, vol_args2->subvolid); if (IS_ERR(subvol_name_ptr)) { - err = PTR_ERR(subvol_name_ptr); + ret = PTR_ERR(subvol_name_ptr); goto free_parent; } /* subvol_name_ptr is already nul terminated */ @@ -2487,11 +2495,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; + subvol_name = vol_args->name; - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; } @@ -2499,26 +2510,26 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (strchr(subvol_name, '/') || strncmp(subvol_name, "..", subvol_namelen) == 0) { - err = -EINVAL; + ret = -EINVAL; goto free_subvol_name; } if (!S_ISDIR(dir->i_mode)) { - err = -ENOTDIR; + ret = -ENOTDIR; goto free_subvol_name; } - err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (err == -EINTR) + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) goto free_subvol_name; dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); + ret = PTR_ERR(dentry); goto out_unlock_dir; } if (d_really_is_negative(dentry)) { - err = -ENOENT; + ret = -ENOENT; goto out_dput; } @@ -2538,7 +2549,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * Users who want to delete empty subvols should try * rmdir(2). */ - err = -EPERM; + ret = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; @@ -2549,29 +2560,29 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * of the subvol, not a random directory contained * within it. 
*/ - err = -EINVAL; + ret = -EINVAL; if (root == dest) goto out_dput; - err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); - if (err) + ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); + if (ret) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(idmap, dir, dentry, 1); - if (err) + ret = btrfs_may_delete(idmap, dir, dentry, 1); + if (ret) goto out_dput; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; + ret = -EINVAL; goto out_dput; } btrfs_inode_lock(BTRFS_I(inode), 0); - err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); + ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry); btrfs_inode_unlock(BTRFS_I(inode), 0); - if (!err) + if (!ret) d_delete_notify(dir, dentry); out_dput: @@ -2588,7 +2599,7 @@ out_drop_write: out: kfree(vol_args2); kfree(vol_args); - return err; + return ret; } static int btrfs_ioctl_defrag(struct file *file, void __user *argp) @@ -2672,6 +2683,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) return -EINVAL; } + if (fs_info->fs_devices->temp_fsid) { + btrfs_err(fs_info, + "device add not supported on cloned temp-fsid mount"); + return -EINVAL; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -2692,12 +2709,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) goto out; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); +out_free: kfree(vol_args); out: if (restore_op) @@ -2711,10 +2732,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct block_device *bdev = NULL; - void *holder; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2730,7 +2750,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto out; + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { args.devid = vol_args->devid; } else if (!strcmp("cancel", vol_args->name)) { @@ -2751,7 +2774,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); btrfs_exclop_finish(fs_info); @@ -2765,8 +2788,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev) - blkdev_put(bdev, holder); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2777,10 +2800,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args *vol_args; - struct block_device *bdev = NULL; - void *holder; + struct file 
*bdev_file = NULL; int ret; bool cancel = false; @@ -2791,7 +2813,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { @@ -2807,17 +2832,18 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev) - blkdev_put(bdev, holder); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); +out_free: kfree(vol_args); return ret; } @@ -2859,7 +2885,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { - fi_args->generation = fs_info->generation; + fi_args->generation = btrfs_get_fs_generation(fs_info); fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } @@ -2921,7 +2947,7 @@ out: static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -2953,7 +2979,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(new_root->root_key.objectid)) { + if (!is_fstree(btrfs_root_id(new_root))) { ret = -ENOENT; goto out_free; } @@ -3168,7 +3194,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, return PTR_ERR(trans); /* No running transaction, don't bother */ - transid = root->fs_info->last_trans_committed; + transid = btrfs_get_last_trans_committed(root->fs_info); goto out; } transid = trans->transid; @@ -3195,7 +3221,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); struct btrfs_ioctl_scrub_args *sa; int ret; @@ -3713,7 +3739,7 @@ out: static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_ctl_args *sa; int ret; @@ -3730,14 +3756,43 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) goto drop_write; } - down_write(&fs_info->subvol_sem); - switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: - ret = btrfs_quota_enable(fs_info); + case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: + down_write(&fs_info->subvol_sem); + ret = btrfs_quota_enable(fs_info, sa); + up_write(&fs_info->subvol_sem); break; case BTRFS_QUOTA_CTL_DISABLE: + /* + * Lock the cleaner mutex to prevent races with concurrent + * relocation, because relocation may be building backrefs for + * blocks of the quota root while we are deleting the root. 
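The new BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA case is reached through the existing quota ctl ioctl. A hypothetical userspace sketch, assuming uapi headers that already define the new command value:

#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int enable_simple_quotas(int fs_fd)
{
        struct btrfs_ioctl_quota_ctl_args args = {
                .cmd = BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA,
        };

        /* Same ioctl as classic qgroups; the cmd field selects the mode. */
        return ioctl(fs_fd, BTRFS_IOC_QUOTA_CTL, &args);
}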
This + * is like dropping fs roots of deleted snapshots/subvolumes, we + * need the same protection. + * + * This also prevents races between concurrent tasks trying to + * disable quotas, because we will unlock and relock + * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + * + * We take this here because we have the dependency of + * + * inode_lock -> subvol_sem + * + * because of rename. With relocation we can prealloc extents, + * so that makes the dependency chain + * + * cleaner_mutex -> inode_lock -> subvol_sem + * + * so we must take the cleaner_mutex here before we take the + * subvol_sem. The deadlock can't actually happen, but this + * quiets lockdep. + */ + mutex_lock(&fs_info->cleaner_mutex); + down_write(&fs_info->subvol_sem); ret = btrfs_quota_disable(fs_info); + up_write(&fs_info->subvol_sem); + mutex_unlock(&fs_info->cleaner_mutex); break; default: ret = -EINVAL; @@ -3745,18 +3800,34 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) } kfree(sa); - up_write(&fs_info->subvol_sem); drop_write: mnt_drop_write_file(file); return ret; } +/* + * Quick check for ioctl handlers if quotas are enabled. Proper locking must be + * done before any operations. + */ +static bool qgroup_enabled(struct btrfs_fs_info *fs_info) +{ + bool ret = true; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) + ret = false; + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + return ret; +} + static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; + struct btrfs_qgroup_list *prealloc = NULL; struct btrfs_trans_handle *trans; int ret; int err; @@ -3764,6 +3835,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3774,14 +3848,27 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) goto drop_write; } + if (sa->assign) { + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + if (!prealloc) { + ret = -ENOMEM; + goto drop_write; + } + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } + /* + * Prealloc ownership is moved to the relation handler, there it's used + * or freed on error. + */ if (sa->assign) { - ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst); + ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst, prealloc); + prealloc = NULL; } else { ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); } @@ -3791,13 +3878,15 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) err = btrfs_run_qgroups(trans); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (err < 0) - btrfs_handle_fs_error(fs_info, err, - "failed to update qgroup status and info"); + btrfs_warn(fs_info, + "qgroup status update failed after %s relation, marked as inconsistent", + sa->assign ? 
"adding" : "deleting"); err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: + kfree(prealloc); kfree(sa); drop_write: mnt_drop_write_file(file); @@ -3816,6 +3905,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3872,6 +3964,9 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(root->fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3891,7 +3986,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = sa->qgroupid; if (!qgroupid) { /* take the current subvol as qgroup */ - qgroupid = root->root_key.objectid; + qgroupid = btrfs_root_id(root); } ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); @@ -3910,13 +4005,16 @@ drop_write: static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!qgroup_enabled(fs_info)) + return -ENOTCONN; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3974,7 +4072,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; @@ -4022,7 +4120,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, !btrfs_is_empty_uuid(root_item->received_uuid)) { ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + btrfs_root_id(root)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4046,7 +4144,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + btrfs_root_id(root)); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4162,7 +4260,7 @@ static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; @@ -4305,7 +4403,7 @@ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags 
flags[2]; @@ -4373,7 +4471,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4416,12 +4514,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; loff_t pos; struct kiocb kiocb; ssize_t ret; + u64 disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4474,7 +4577,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos; - ret = btrfs_encoded_read(&kiocb, &iter, &args); + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + + if (ret == -EIOCBQUEUED) { + bool unlocked = false; + u64 start, lockend, count; + + start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + if (args.compression) + count = disk_io_size; + else + count = args.len; + + ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + args.compression, &unlocked); + + if (!unlocked) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + } + if (ret >= 0) { fsnotify_access(file); if (copy_to_user(argp + copy_end, @@ -4561,29 +4689,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (ret < 0) goto out_acct; - file_start_write(file); - if (iov_iter_count(&iter) == 0) { ret = 0; - goto out_end_write; + goto out_iov; } pos = args.offset; ret = rw_verify_area(WRITE, file, &pos, args.len); if (ret < 0) - goto out_end_write; + goto out_iov; init_sync_kiocb(&kiocb, file); - ret = kiocb_set_rw_flags(&kiocb, 0); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) - goto out_end_write; + goto out_iov; kiocb.ki_pos = pos; + file_start_write(file); + ret = btrfs_do_write_iter(&kiocb, &iter, &args); if (ret > 0) fsnotify_modify(file); -out_end_write: file_end_write(file); +out_iov: kfree(iov); out_acct: if (ret > 0) @@ -4596,7 +4724,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; @@ -4665,11 +4793,10 @@ long btrfs_ioctl(struct file *file, unsigned int return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* - * The transaction thread may want to do more work, - * namely it pokes the cleaner kthread that will start - * processing uncleaned subvols. + * There may be work for the cleaner kthread to do (subvolume + * deletion, delayed iputs, defrag inodes, etc), so wake it up. 
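Stepping back to the encoded read path above: the rework splits BTRFS_IOC_ENCODED_READ into two phases. btrfs_encoded_read() either completes inline or returns -EIOCBQUEUED with the extent range still locked, and the ioctl handler then issues the regular on-disk read itself and drops the locks. A minimal userspace model of that calling contract; the struct and function names here are illustrative, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

#define EIOCBQUEUED 529  /* kernel-internal errno, defined here for the model */

struct read_ctx {
        bool locked;          /* inode + extent range left locked for phase two */
        long disk_bytenr;     /* filled in when -EIOCBQUEUED is returned */
        long disk_io_size;
};

/* Phase one: serve inline/cached data, or hand the disk read to the caller. */
static int encoded_read_fast(struct read_ctx *ctx, bool inline_data)
{
        if (inline_data)
                return 42;                /* bytes copied, nothing left to do */
        ctx->locked = true;               /* caller now owns the locked range */
        ctx->disk_bytenr = 4096;
        ctx->disk_io_size = 8192;
        return -EIOCBQUEUED;              /* caller must issue the regular read */
}

/* Phase two: the "regular" read against [disk_bytenr, +disk_io_size). */
static int encoded_read_regular(struct read_ctx *ctx)
{
        ctx->locked = false;              /* drops the locks on the way out */
        return (int)ctx->disk_io_size;
}

int main(void)
{
        struct read_ctx ctx = {0};
        int ret = encoded_read_fast(&ctx, false);

        if (ret == -EIOCBQUEUED) {
                ret = encoded_read_regular(&ctx);
                if (ctx.locked)           /* error paths may leave the range locked */
                        ctx.locked = false;
        }
        printf("read %d bytes\n", ret);
        return 0;
}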
*/ - wake_up_process(fs_info->transaction_kthread); + wake_up_process(fs_info->cleaner_kthread); return ret; } case BTRFS_IOC_START_SYNC: @@ -4695,10 +4822,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(inode, argp, false); + return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(inode, argp, true); + return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index d51b9a2f2f6e..19cd26b0244a 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -3,6 +3,15 @@ #ifndef BTRFS_IOCTL_H #define BTRFS_IOCTL_H +#include <linux/types.h> + +struct file; +struct dentry; +struct mnt_idmap; +struct fileattr; +struct btrfs_fs_info; +struct btrfs_ioctl_balance_args; + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); @@ -10,7 +19,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(u8 *uuid); +int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 7979449a58d6..6a0b7abb5bd9 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -8,11 +8,11 @@ #include <linux/spinlock.h> #include <linux/page-flags.h> #include <asm/bug.h> +#include <trace/events/btrfs.h> #include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" -#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. 
For a given @@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, + { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") }, { .id = 0, DEFINE_NAME("tree") }, }; @@ -83,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int { struct btrfs_lockdep_keyset *ks; - BUG_ON(level >= ARRAY_SIZE(ks->keys)); + ASSERT(level < ARRAY_SIZE(ks->keys)); /* Find the matching keyset, id 0 is the default entry */ for (ks = btrfs_lockdep_keysets; ks->id; ks++) @@ -96,12 +97,21 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb) { if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) - btrfs_set_buffer_lockdep_class(root->root_key.objectid, + btrfs_set_buffer_lockdep_class(btrfs_root_id(root), eb, btrfs_header_level(eb)); } #endif +#ifdef CONFIG_BTRFS_DEBUG +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) +{ + eb->lock_owner = owner; +} +#else +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { } +#endif + /* * Extent buffer locking * ===================== @@ -119,14 +129,14 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff */ /* - * __btrfs_tree_read_lock - lock extent buffer for read + * btrfs_tree_read_lock_nested - lock extent buffer for read * @eb: the eb to be locked * @nest: the nesting level to be used for lockdep * * This takes the read lock on the extent buffer, using the specified nesting * level for lockdep purposes. */ -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) +void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest) { u64 start_ns = 0; @@ -137,11 +147,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne trace_btrfs_tree_read_lock(eb, start_ns); } -void btrfs_tree_read_lock(struct extent_buffer *eb) -{ - __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL); -} - /* * Try-lock for read. * @@ -164,7 +169,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_write_lock(struct extent_buffer *eb) { if (down_write_trylock(&eb->lock)) { - eb->lock_owner = current->pid; + btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_try_tree_write_lock(eb); return 1; } @@ -181,13 +186,14 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) } /* - * __btrfs_tree_lock - lock eb for write + * Lock eb for write. + * * @eb: the eb to lock * @nest: the nesting to use for the lock * * Returns with the eb->lock write locked. */ -void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) +void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -196,22 +202,17 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) start_ns = ktime_get_ns(); down_write_nested(&eb->lock, nest); - eb->lock_owner = current->pid; + btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_tree_lock(eb, start_ns); } -void btrfs_tree_lock(struct extent_buffer *eb) -{ - __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); -} - /* * Release the write lock. 
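The btrfs_set_eb_lock_owner() hunk above uses a common kernel idiom: the helper has a real body only under CONFIG_BTRFS_DEBUG and collapses to an empty static function otherwise, so the call sites stay free of #ifdefs. A compilable sketch of the pattern, with a hypothetical DEBUG_LOCKS macro standing in for the kernel config option:

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

struct extent_buffer { pid_t lock_owner; };

#ifdef DEBUG_LOCKS                      /* stand-in for CONFIG_BTRFS_DEBUG */
static void set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
{
        eb->lock_owner = owner;
}
#else
/* Empty stub: the calls below compile to nothing in non-debug builds. */
static void set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
#endif

int main(void)
{
        struct extent_buffer eb = { .lock_owner = 0 };

        set_eb_lock_owner(&eb, getpid());   /* no #ifdef at the call site */
        set_eb_lock_owner(&eb, 0);
        printf("owner: %ld\n", (long)eb.lock_owner);
        return 0;
}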
*/ void btrfs_tree_unlock(struct extent_buffer *eb) { trace_btrfs_tree_unlock(eb); - eb->lock_owner = 0; + btrfs_set_eb_lock_owner(eb, 0); up_write(&eb->lock); } @@ -363,8 +364,12 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) { - atomic_dec(&lock->writers); - cond_wake_up(&lock->pending_readers); + /* + * atomic_dec_and_test() implies a full barrier, so woken up readers are + * guaranteed to see the decrement. + */ + if (atomic_dec_and_test(&lock->writers)) + wake_up(&lock->pending_readers); } void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 7d6ee1e609bf..3c15c75e0582 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -8,9 +8,14 @@ #include <linux/atomic.h> #include <linux/wait.h> +#include <linux/lockdep.h> #include <linux/percpu_counter.h> #include "extent_io.h" +struct extent_buffer; +struct btrfs_path; +struct btrfs_root; + #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 @@ -157,14 +162,22 @@ enum btrfs_lockdep_trans_states { static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -struct btrfs_path; +void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest); + +static inline void btrfs_tree_lock(struct extent_buffer *eb) +{ + btrfs_tree_lock_nested(eb, BTRFS_NESTING_NORMAL); +} -void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); -void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); -void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest); + +static inline void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + btrfs_tree_read_lock_nested(eb, BTRFS_NESTING_NORMAL); +} + void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c index 0fe0ae54ac67..fd88af17d8d9 100644 --- a/fs/btrfs/lru_cache.c +++ b/fs/btrfs/lru_cache.c @@ -9,7 +9,7 @@ * * @cache: The cache. * @max_size: Maximum size (number of entries) for the cache. - * Use 0 for unlimited size, it's the user's responsability to + * Use 0 for unlimited size, it's the user's responsibility to * trim the cache in that case.
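The btrfs_drew_write_unlock() change above wakes pending readers only when the last writer leaves, and relies on atomic_dec_and_test() acting as a full memory barrier so woken readers observe the decrement. A minimal userspace model of the same pattern, assuming C11 seq-cst atomics and a pthread condition variable in place of the kernel wait queue:

#include <pthread.h>
#include <stdatomic.h>

struct drew_lock {
        atomic_int writers;
        pthread_mutex_t mtx;
        pthread_cond_t pending_readers;
};

static void drew_write_unlock(struct drew_lock *lock)
{
        /*
         * Like atomic_dec_and_test(): a seq-cst RMW acts as a full barrier,
         * so a woken reader is guaranteed to observe the decremented count.
         * Only the writer that drops the count to zero issues the wakeup.
         */
        if (atomic_fetch_sub(&lock->writers, 1) == 1) {
                pthread_mutex_lock(&lock->mtx);
                pthread_cond_broadcast(&lock->pending_readers);
                pthread_mutex_unlock(&lock->mtx);
        }
}

int main(void)
{
        struct drew_lock lock = {
                .writers = 1,
                .mtx = PTHREAD_MUTEX_INITIALIZER,
                .pending_readers = PTHREAD_COND_INITIALIZER,
        };

        drew_write_unlock(&lock);       /* last writer: readers get woken */
        return 0;
}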
*/ void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index 00328c856be6..07f1bb1c6aa3 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -3,6 +3,7 @@ #ifndef BTRFS_LRU_CACHE_H #define BTRFS_LRU_CACHE_H +#include <linux/types.h> #include <linux/maple_tree.h> #include <linux/list.h> @@ -50,11 +51,6 @@ struct btrfs_lru_cache { #define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) -static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) -{ - return cache->size; -} - static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( struct btrfs_lru_cache *cache) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d3fcfc628a4f..72856f6775f7 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -130,17 +130,17 @@ static inline size_t read_compress_length(const char *buf) */ static int copy_compressed_data_to_page(char *compressed_data, size_t compressed_size, - struct page **out_pages, - unsigned long max_nr_page, + struct folio **out_folios, + unsigned long max_nr_folio, u32 *cur_out, const u32 sectorsize) { u32 sector_bytes_left; u32 orig_out; - struct page *cur_page; + struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; /* @@ -149,16 +149,16 @@ static int copy_compressed_data_to_page(char *compressed_data, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = alloc_page(GFP_NOFS); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -172,18 +172,18 @@ static int copy_compressed_data_to_page(char *compressed_data, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = alloc_page(GFP_NOFS); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -209,15 +209,15 @@ out: return 0; } -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out) +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; - struct page *page_in = NULL; + const u32 sectorsize = 
inode_to_fs_info(mapping->host)->sectorsize; + struct folio *folio_in = NULL; char *sizes_ptr; - const unsigned long max_nr_page = *out_pages; + const unsigned long max_nr_folio = *out_folios; int ret = 0; /* Points to the file offset of input data */ u64 cur_in = start; @@ -225,8 +225,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u32 cur_out = 0; u32 len = *total_out; - ASSERT(max_nr_page > 0); - *out_pages = 0; + ASSERT(max_nr_folio > 0); + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -243,28 +243,29 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, size_t out_len; /* Get the input page first */ - if (!page_in) { - page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); - ASSERT(page_in); + if (!folio_in) { + ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); + if (ret < 0) + goto out; } /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_page(page_in); + data_in = kmap_local_folio(folio_in, 0); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, workspace->mem); kunmap_local(data_in); - if (ret < 0) { - pr_debug("BTRFS: lzo in loop returned %d\n", ret); + if (unlikely(ret < 0)) { + /* lzo1x_1_compress never fails. */ ret = -EIO; goto out; } ret = copy_compressed_data_to_page(workspace->cbuf, out_len, - pages, max_nr_page, + folios, max_nr_folio, &cur_out, sectorsize); if (ret < 0) goto out; @@ -282,13 +283,13 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we have reached page boundary */ if (PAGE_ALIGNED(cur_in)) { - put_page(page_in); - page_in = NULL; + folio_put(folio_in); + folio_in = NULL; } } /* Store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_page(pages[0]); + sizes_ptr = kmap_local_folio(folios[0], 0); write_compress_length(sizes_ptr, cur_out); kunmap_local(sizes_ptr); @@ -296,9 +297,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = cur_out; *total_in = cur_in - start; out: - if (page_in) - put_page(page_in); - *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); + if (folio_in) + folio_put(folio_in); + *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@ -313,15 +314,15 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct page *cur_page; + struct folio *cur_folio; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); ASSERT(copy_len); - cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; - memcpy_from_page(dest + *cur_in - orig_in, cur_page, - offset_in_page(*cur_in), copy_len); + memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, + offset_in_folio(cur_folio, *cur_in), copy_len); *cur_in += copy_len; } @@ -341,7 +342,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap_local_page(cb->compressed_pages[0]); + kaddr = kmap_local_folio(cb->compressed_folios[0], 0); len_in = read_compress_length(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -353,17 +354,20 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) * and all sectors should be used. * If this happens, it means the compressed extent is corrupted. 
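The header checks above are easier to follow with btrfs's LZO framing in mind: the first LZO_LEN (4) bytes hold the total compressed length, each segment is a 4-byte little-endian length followed by that many compressed bytes, and a segment header never crosses a sector boundary (the writer pads to the next sector instead). A hedged userspace walker over that layout; it assumes a little-endian host and applies far fewer sanity checks than the kernel does:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LZO_LEN 4

static uint32_t read_len(const uint8_t *buf)
{
        uint32_t v;

        memcpy(&v, buf, LZO_LEN);       /* on-disk lengths are little endian */
        return v;
}

/* Walk the segments of one compressed extent; 0 means the framing is sane. */
static int walk_segments(const uint8_t *data, uint32_t sectorsize)
{
        const uint32_t len_in = read_len(data);   /* total compressed size */
        uint32_t cur = LZO_LEN;

        while (cur < len_in) {
                uint32_t seg_len;

                /* Headers never cross a sector: skip the sector's tail pad. */
                if (sectorsize - (cur % sectorsize) < LZO_LEN)
                        cur = (cur / sectorsize + 1) * sectorsize;
                seg_len = read_len(data + cur);
                cur += LZO_LEN;
                if (seg_len == 0 || cur + seg_len > len_in)
                        return -1;                 /* corrupted framing */
                printf("segment: %u bytes at offset %u\n", seg_len, cur);
                cur += seg_len;
        }
        return 0;
}

int main(void)
{
        uint8_t buf[64] = {0};
        const uint32_t total = LZO_LEN + LZO_LEN + 5;
        const uint32_t seg = 5;

        memcpy(buf, &total, LZO_LEN);           /* extent header */
        memcpy(buf + LZO_LEN, &seg, LZO_LEN);   /* one 5-byte segment */
        return walk_segments(buf, 4096);
}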
*/ - if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || - round_up(len_in, sectorsize) < cb->compressed_len) { + if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || + round_up(len_in, sectorsize) < cb->compressed_len)) { + struct btrfs_inode *inode = cb->bbio.inode; + btrfs_err(fs_info, - "invalid lzo header, lzo len %u compressed len %u", - len_in, cb->compressed_len); +"lzo header invalid, root %llu inode %llu offset %llu lzo len %u compressed len %u", + btrfs_root_id(inode->root), btrfs_ino(inode), + cb->start, len_in, cb->compressed_len); return -EUCLEAN; } /* Go through each lzo segment */ while (cur_in < len_in) { - struct page *cur_page; + struct folio *cur_folio; /* Length of the compressed segment */ u32 seg_len; u32 sector_bytes_left; @@ -375,20 +379,24 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; - ASSERT(cur_page); - kaddr = kmap_local_page(cur_page); + cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + ASSERT(cur_folio); + kaddr = kmap_local_folio(cur_folio, 0); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; - if (seg_len > WORKSPACE_CBUF_LENGTH) { + if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) { + struct btrfs_inode *inode = cb->bbio.inode; + /* * seg_len shouldn't be larger than we have allocated * for workspace->cbuf */ - btrfs_err(fs_info, "unexpectedly large lzo segment len %u", - seg_len); + btrfs_err(fs_info, + "lzo segment too big, root %llu inode %llu offset %llu len %u", + btrfs_root_id(inode->root), btrfs_ino(inode), + cb->start, seg_len); return -EIO; } @@ -398,8 +406,13 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Decompress the data */ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, workspace->buf, &out_len); - if (ret != LZO_E_OK) { - btrfs_err(fs_info, "failed to decompress"); + if (unlikely(ret != LZO_E_OK)) { + struct btrfs_inode *inode = cb->bbio.inode; + + btrfs_err(fs_info, + "lzo decompression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + cb->start); return -EIO; } @@ -425,16 +438,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,37 +464,26 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); - if (ret != LZO_E_OK) { - pr_warn("BTRFS: decompress failed!\n"); + if (unlikely(ret != LZO_E_OK)) { + struct btrfs_inode *inode = folio_to_inode(dest_folio); + + btrfs_err(fs_info, + "lzo decompression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + 
folio_pos(dest_folio)); ret = -EIO; goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. - */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 7695decc7243..363fd28c0268 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -3,13 +3,11 @@ #include "fs.h" #include "messages.h" #include "discard.h" -#include "transaction.h" -#include "space-info.h" #include "super.h" #ifdef CONFIG_PRINTK -#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_PREFACE " state " #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) /* @@ -22,7 +20,8 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', [BTRFS_FS_STATE_DEV_REPLACING] = 'R', [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, - [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', + [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S', [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', }; @@ -72,11 +71,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) * over the error. Each subsequent error that doesn't have any context * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. */ -const char * __attribute_const__ btrfs_decode_error(int errno) +const char * __attribute_const__ btrfs_decode_error(int error) { char *errstr = "unknown"; - switch (errno) { + switch (error) { case -ENOENT: /* -2 */ errstr = "No such entry"; break; @@ -110,12 +109,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno) } /* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. + * Decodes expected errors from the caller and invokes the appropriate error + * response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK @@ -132,11 +131,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Special case: if the error is EROFS, and we're already under * SB_RDONLY, then it is safe here. 
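The fs_state_chars[] table earlier in this messages.c hunk drives the short state suffix appended to the device name in log output ('A' for an aborted transaction, the new 'C'/'S' pair for skipped data/metadata checksums, and so on). A rough userspace reconstruction of that encoding, with assumed bit names:

#include <stdio.h>

enum { STATE_TRANS_ABORTED, STATE_NO_DATA_CSUMS, STATE_SKIP_META_CSUMS, STATE_COUNT };

static const char state_chars[STATE_COUNT] = {
        [STATE_TRANS_ABORTED]   = 'A',
        [STATE_NO_DATA_CSUMS]   = 'C',
        [STATE_SKIP_META_CSUMS] = 'S',
};

/* Append " state XY.." for each set bit; empty string when none are set. */
static void state_to_string(unsigned long state, char *buf, size_t len)
{
        size_t pos = 0;

        for (int bit = 0; bit < STATE_COUNT && pos + 2 < len; bit++) {
                if (!(state & (1UL << bit)) || !state_chars[bit])
                        continue;
                if (pos == 0)
                        pos += snprintf(buf, len, " state ");
                buf[pos++] = state_chars[bit];
        }
        buf[pos] = '\0';
}

int main(void)
{
        char buf[32];

        state_to_string((1UL << STATE_TRANS_ABORTED) |
                        (1UL << STATE_NO_DATA_CSUMS), buf, sizeof(buf));
        printf("BTRFS: error (device sda1%s)\n", buf);  /* ...sda1 state AC) */
        return 0;
}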
*/ - if (errno == -EROFS && sb_rdonly(sb)) + if (error == -EROFS && sb_rdonly(sb)) return; #ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); + errstr = btrfs_decode_error(error); btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; @@ -147,11 +146,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function vaf.va = &args; pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); + sb->s_id, statestr, function, line, error, errstr, &vaf); va_end(args); } else { pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); + sb->s_id, statestr, function, line, error, errstr); } #endif @@ -159,7 +158,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Today we only save the error info to memory. Long term we'll also * send it down to the disk. */ - WRITE_ONCE(fs_info->fs_error, errno); + WRITE_ONCE(fs_info->fs_error, error); /* Don't go through full error handling during mount. */ if (!(sb->s_flags & SB_BORN)) @@ -240,7 +239,8 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, vaf.fmt = fmt; vaf.va = &args; - if (__ratelimit(ratelimit)) { + /* Do not ratelimit if CONFIG_BTRFS_DEBUG is enabled. */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG) || __ratelimit(ratelimit)) { if (fs_info) { char statestr[STATE_STRING_BUF_LEN]; @@ -283,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) #endif /* - * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an - * alert, and either panics or BUGs, depending on mount options. + * Decode unexpected, fatal errors from the caller, issue an alert, and either + * panic or BUGs, depending on mount options. */ __cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) +void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int error, const char *fmt, ...) { char *s_id = "<unknown>"; const char *errstr; @@ -301,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(errno); + errstr = btrfs_decode_error(error); if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); + s_id, function, line, &vaf, error, errstr); btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); + function, line, &vaf, error, errstr); va_end(args); /* Caller calls BUG() */ } diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 1ae6f8e23e07..08a9272399d2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -184,25 +184,25 @@ do { \ __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); + unsigned int line, int error, const char *fmt, ...); -const char * __attribute_const__ btrfs_decode_error(int errno); +const char * __attribute_const__ btrfs_decode_error(int error); -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +#define btrfs_handle_fs_error(fs_info, error, fmt, args...) 
\ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) + (error), fmt, ##args) __printf(5, 6) __cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); +void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int error, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic * will panic(). Otherwise we BUG() here. */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ +#define btrfs_panic(fs_info, error, fmt, args...) \ do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + __btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args); \ BUG(); \ } while (0) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 40f2d9f1a17a..0d599fd847c9 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -3,6 +3,8 @@ #ifndef BTRFS_MISC_H #define BTRFS_MISC_H +#include <linux/types.h> +#include <linux/bitmap.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/math64.h> @@ -64,7 +66,7 @@ struct rb_simple_node { u64 bytenr; }; -static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) +static inline struct rb_node *rb_simple_search(const struct rb_root *root, u64 bytenr) { struct rb_node *node = root->rb_node; struct rb_simple_node *entry; @@ -91,7 +93,7 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) * Return the rb_node that start at or after @bytenr. If there is no entry at * or after @bytner return NULL. */ -static inline struct rb_node *rb_simple_search_first(struct rb_root *root, +static inline struct rb_node *rb_simple_search_first(const struct rb_root *root, u64 bytenr) { struct rb_node *node = root->rb_node, *ret = NULL; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8a3c46cb67f5..880f9553d79d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,7 +19,7 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" -#include "super.h" +#include "block-group.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, * look find the first ordered struct that has this offset, otherwise * the first one less than this offset */ -static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, - u64 file_offset) +static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode, + u64 file_offset) { - struct rb_root *root = &tree->tree; struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; - if (tree->last) { - entry = rb_entry(tree->last, struct btrfs_ordered_extent, + if (inode->ordered_tree_last) { + entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent, rb_node); if (in_range(file_offset, entry->file_offset, entry->num_bytes)) - return tree->last; + return inode->ordered_tree_last; } - ret = __tree_search(root, file_offset, &prev); + ret = __tree_search(&inode->ordered_tree, file_offset, &prev); if (!ret) ret = prev; if (ret) - tree->last = ret; + inode->ordered_tree_last = ret; return ret; } @@ -154,9 +153,10 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( struct btrfs_ordered_extent *entry; int ret; u64 qgroup_rsv = 0; + const bool is_nocow = (flags & + ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); - if (flags & - ((1 << BTRFS_ORDERED_NOCOW) | (1 << 
BTRFS_ORDERED_PREALLOC))) { + if (is_nocow) { /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) @@ -171,8 +171,13 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( return ERR_PTR(ret); } entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) + if (!entry) { + if (!is_nocow) + btrfs_qgroup_free_refroot(inode->root->fs_info, + btrfs_root_id(inode->root), + qgroup_rsv, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(-ENOMEM); + } entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -181,7 +186,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->disk_num_bytes = disk_num_bytes; entry->offset = offset; entry->bytes_left = num_bytes; - entry->inode = igrab(&inode->vfs_inode); + entry->inode = BTRFS_I(igrab(&inode->vfs_inode)); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = qgroup_rsv; @@ -192,6 +197,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); + INIT_LIST_HEAD(&entry->bioc_list); init_completion(&entry->completion); /* @@ -208,8 +214,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( static void insert_ordered_extent(struct btrfs_ordered_extent *entry) { - struct btrfs_inode *inode = BTRFS_I(entry->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_inode *inode = entry->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -222,13 +227,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) /* One ref for the tree. */ refcount_inc(&entry->refs); - spin_lock_irq(&tree->lock); - node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); - if (node) + spin_lock_irq(&inode->ordered_tree_lock); + node = tree_insert(&inode->ordered_tree, entry->file_offset, + &entry->rb_node); + if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -253,7 +259,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) * @disk_bytenr: Offset of extent on disk. * @disk_num_bytes: Size of extent on disk. * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*). * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The @@ -264,17 +270,39 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) */ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type) + const struct btrfs_file_extent *file_extent, unsigned long flags) { struct btrfs_ordered_extent *entry; ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); - entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes, - disk_bytenr, disk_num_bytes, offset, flags, - compress_type); + /* + * For regular writes, we just use the members in @file_extent. 
+ * + * For NOCOW, we don't really care about the numbers except @start and + * file_extent->num_bytes, as we won't insert a file extent item at all. + * + * For PREALLOC, we do not use ordered extent members, but + * btrfs_mark_extent_written() handles everything. + * + * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents, + * or btrfs_split_ordered_extent() cannot handle it correctly. + */ + if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))) + entry = alloc_ordered_extent(inode, file_offset, + file_extent->num_bytes, + file_extent->num_bytes, + file_extent->disk_bytenr + file_extent->offset, + file_extent->num_bytes, 0, flags, + file_extent->compression); + else + entry = alloc_ordered_extent(inode, file_offset, + file_extent->num_bytes, + file_extent->ram_bytes, + file_extent->disk_bytenr, + file_extent->disk_num_bytes, + file_extent->offset, flags, + file_extent->compression); if (!IS_ERR(entry)) insert_ordered_extent(entry); return entry; @@ -288,12 +316,17 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_inode *inode = entry->inode; - tree = &BTRFS_I(entry->inode)->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); +} + +void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) +{ + if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO); } static void finish_ordered_fn(struct btrfs_work *work) @@ -305,18 +338,18 @@ static void finish_ordered_fn(struct btrfs_work *work) } static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - lockdep_assert_held(&inode->ordered_tree.lock); + lockdep_assert_held(&inode->ordered_tree_lock); - if (page) { - ASSERT(page->mapping); - ASSERT(page_offset(page) <= file_offset); - ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + if (folio) { + ASSERT(folio->mapping); + ASSERT(folio_pos(folio) <= file_offset); + ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* * Ordered (Private2) bit indicates whether we still have @@ -324,16 +357,16 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; - btrfs_page_clear_ordered(fs_info, page, file_offset, len); + btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); } /* Now we're fine to update the accounting. 
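can_finish_ordered_extent() above is, at its core, a range-completion counter: every finished sub-range decrements bytes_left, and only the call that drives it to zero queues the final processing. A stripped-down model of that accounting, assuming the caller never reports overlapping sub-ranges:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ordered_extent {
        uint64_t file_offset;
        uint64_t num_bytes;
        uint64_t bytes_left;
};

/* Returns true when the sub-range just reported finished the whole extent. */
static bool mark_range_finished(struct ordered_extent *oe, uint64_t offset,
                                uint64_t len)
{
        assert(offset >= oe->file_offset);
        assert(offset + len <= oe->file_offset + oe->num_bytes);
        assert(len <= oe->bytes_left);          /* the kernel WARNs and clamps */

        oe->bytes_left -= len;
        return oe->bytes_left == 0;             /* caller queues completion */
}

int main(void)
{
        struct ordered_extent oe = { 0, 16384, 16384 };

        printf("%d\n", mark_range_finished(&oe, 0, 4096));       /* 0 */
        printf("%d\n", mark_range_finished(&oe, 4096, 12288));   /* 1 */
        return 0;
}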
*/ if (WARN_ON_ONCE(len > ordered->bytes_left)) { btrfs_crit(fs_info, "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", - inode->root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(inode->root), btrfs_ino(inode), ordered->file_offset, ordered->num_bytes, len, ordered->bytes_left); ordered->bytes_left = 0; @@ -360,39 +393,70 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? fs_info->endio_freespace_worker : fs_info->endio_write_workers; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL); btrfs_queue_work(wq, &ordered->work); } -bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, +void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; unsigned long flags; bool ret; trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); - spin_lock_irqsave(&inode->ordered_tree.lock, flags); - ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); - spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); + ret = can_finish_ordered_extent(ordered, folio, file_offset, len, + uptodate); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + + /* + * If this is a COW write it means we created new extent maps for the + * range and they point to unwritten locations if we got an error either + * before submitting a bio or during IO. + * + * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we + * are queuing its completion below. During completion, at + * btrfs_finish_one_ordered(), we will drop the extent maps for the + * unwritten extents. + * + * However because completion runs in a work queue we can end up having + * a fast fsync running before that. In the case of direct IO, once we + * unlock the inode the fsync might start, and we queue the completion + * before unlocking the inode. In the case of buffered IO when writeback + * finishes (end_bbio_data_write()) we queue the completion, so if the + * writeback was triggered by a fast fsync, the fsync might start + * logging before ordered extent completion runs in the work queue. + * + * The fast fsync will log file extent items based on the extent maps it + * finds, so if by the time it collects extent maps the ordered extent + * completion didn't happen yet, it will log file extent items that + * point to unwritten extents, resulting in a corruption if a crash + * happens and the log tree is replayed. Note that a fast fsync does not + * wait for completion of ordered extents in order to reduce latency. + * + * Set a flag in the inode so that the next fast fsync will wait for + * ordered extents to complete before starting to log. 
+ */ + if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); if (ret) btrfs_queue_ordered_fn(ordered); - return ret; } /* * Mark all ordered extents io inside the specified range finished. * - * @page: The involved page for the operation. - * For uncompressed buffered IO, the page status also needs to be + * @folio: The involved folio for the operation. + * For uncompressed buffered IO, the folio status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the @@ -402,10 +466,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 num_bytes, bool uptodate) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; @@ -415,13 +478,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, file_offset + num_bytes - 1, uptodate); - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; u64 end; u32 len; - node = tree_search(tree, cur); + node = ordered_tree_search(inode, cur); /* No ordered extents at all */ if (!node) break; @@ -467,14 +530,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { - spin_unlock_irqrestore(&tree->lock, flags); + if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); } cur += len; } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); } /* @@ -498,19 +561,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; bool finished = false; - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); if (cached && *cached) { entry = *cached; goto have_entry; } - node = tree_search(tree, file_offset); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -541,7 +603,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return finished; } @@ -554,14 +616,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); + trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) - btrfs_add_delayed_iput(BTRFS_I(entry->inode)); + btrfs_add_delayed_iput(entry->inode); while 
(!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -579,7 +641,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = btrfs_inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -593,7 +654,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, freespace_inode = btrfs_is_free_space_inode(btrfs_inode); btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); - /* This is paired with btrfs_alloc_ordered_extent. */ + /* This is paired with alloc_ordered_extent(). */ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); @@ -612,16 +673,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); - tree = &btrfs_inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&btrfs_inode->ordered_tree_lock); node = &entry->rb_node; - rb_erase(node, &tree->tree); + rb_erase(node, &btrfs_inode->ordered_tree); RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + if (btrfs_inode->ordered_tree_last == node) + btrfs_inode->ordered_tree_last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&btrfs_inode->ordered_tree_lock); /* * The current running transaction is waiting on us, we need to let it @@ -680,11 +740,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) } /* - * wait for all the ordered extents in a root. This is done when balancing - * space between drives. + * Wait for all the ordered extents in a root. Use @bg as range or do whole + * range if it's NULL. */ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, - const u64 range_start, const u64 range_len) + const struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(splice); @@ -692,7 +752,17 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; u64 count = 0; - const u64 range_end = range_start + range_len; + u64 range_start, range_len; + u64 range_end; + + if (bg) { + range_start = bg->start; + range_len = bg->length; + } else { + range_start = 0; + range_len = U64_MAX; + } + range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -714,15 +784,15 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, - btrfs_run_ordered_extent_work, NULL, NULL); + btrfs_run_ordered_extent_work, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); cond_resched(); - spin_lock(&root->ordered_extent_lock); if (nr != U64_MAX) nr--; count++; + spin_lock(&root->ordered_extent_lock); } list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); @@ -739,8 +809,12 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, return count; } +/* + * Wait for @nr ordered extents that intersect the @bg, or the whole range of + * the filesystem if @bg is NULL. 
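The new calling convention above encodes "whole filesystem" as a NULL block group. A small sketch of the range derivation; note the unlimited case stays within u64 because 0 + U64_MAX does not wrap:

#include <stdint.h>
#include <stdio.h>

struct block_group { uint64_t start; uint64_t length; };

static void range_from_bg(const struct block_group *bg,
                          uint64_t *start, uint64_t *end)
{
        uint64_t len;

        if (bg) {
                *start = bg->start;
                len = bg->length;
        } else {
                *start = 0;
                len = UINT64_MAX;     /* whole range */
        }
        *end = *start + len;          /* exclusive end */
}

int main(void)
{
        struct block_group bg = { 1 << 30, 1 << 30 };
        uint64_t start, end;

        range_from_bg(&bg, &start, &end);
        printf("[%llu, %llu)\n", (unsigned long long)start,
               (unsigned long long)end);
        range_from_bg(NULL, &start, &end);
        printf("[%llu, %llu)\n", (unsigned long long)start,
               (unsigned long long)end);
        return 0;
}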
+ */ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, - const u64 range_start, const u64 range_len) + const struct btrfs_block_group *bg) { struct btrfs_root *root; LIST_HEAD(splice); @@ -758,14 +832,13 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr, - range_start, range_len); + done = btrfs_wait_ordered_extents(root, nr, bg); btrfs_put_root(root); - spin_lock(&fs_info->ordered_root_lock); - if (nr != U64_MAX) { + if (nr != U64_MAX) nr -= done; - } + + spin_lock(&fs_info->ordered_root_lock); } list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); @@ -782,7 +855,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; - struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_inode *inode = entry->inode; bool freespace_inode; trace_btrfs_ordered_extent_start(inode, entry); @@ -809,7 +882,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) /* * Used to wait on ordered extents across a large range of bytes. */ -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len) { int ret = 0; int ret_wb = 0; @@ -839,11 +912,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) * before the ordered extents complete - to avoid failures (-EEXIST) * when adding the new ordered extents to the ordered tree. */ - ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); + ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end); end = orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); + ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -878,14 +951,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - tree = &inode->ordered_tree; - spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, file_offset); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -897,7 +968,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino trace_btrfs_ordered_extent_lookup(inode, entry); } out: - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return entry; } @@ -907,15 +978,13 @@ out: struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) { - node = tree_search(tree, file_offset + len); + node = ordered_tree_search(inode, file_offset + len); if (!node) goto out; } @@ -939,7 +1008,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_range(inode, entry); } - 
spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -950,13 +1019,12 @@ out: void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *n; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); - spin_lock_irq(&tree->lock); - for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + spin_lock_irq(&inode->ordered_tree_lock); + for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { struct btrfs_ordered_extent *ordered; ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); @@ -969,7 +1037,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, refcount_inc(&ordered->refs); trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); } /* @@ -979,13 +1047,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -993,7 +1059,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_first(inode, entry); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1009,15 +1075,14 @@ out: struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct rb_node *cur; struct rb_node *prev; struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&tree->lock); - node = tree->tree.rb_node; + spin_lock_irq(&inode->ordered_tree_lock); + node = inode->ordered_tree.rb_node; /* * Here we don't want to use tree_search() which will use tree->last * and screw up the search order. @@ -1071,7 +1136,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1149,8 +1214,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_inode *inode = ordered->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; @@ -1171,6 +1235,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. 
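btrfs_split_ordered_extent(), continued below, reports failure through the kernel's ERR_PTR convention: an errno is encoded into the returned pointer itself. For readers unfamiliar with it, a userspace re-creation of the three helpers:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *split_extent(int broken)
{
        if (broken)
                return ERR_PTR(-EIO);   /* no point continuing after an error */
        return malloc(64);
}

int main(void)
{
        void *new = split_extent(1);

        if (IS_ERR(new)) {
                printf("split failed: %ld\n", PTR_ERR(new));
                return 1;
        }
        free(new);
        return 0;
}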
+ */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. */ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); return ERR_PTR(-EINVAL); } @@ -1189,15 +1265,32 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( /* One ref for the tree. */ refcount_inc(&new->refs); + /* + * Take the root's ordered_extent_lock to avoid a race with + * btrfs_wait_ordered_extents() when updating the disk_bytenr and + * disk_num_bytes fields of the ordered extent below. And we disable + * IRQs because the inode's ordered_tree_lock is used in IRQ context + * elsewhere. + * + * There's no concern about a previous caller of + * btrfs_wait_ordered_extents() seeing the trimmed ordered extent + * before the new one is inserted: even if it grabs the ordered extent + * before the trim, the extent may still be trimmed (and the new one + * missed) right before or during its use. There's no way around that + * and it's harmless for current use cases, so we take the root's + * ordered_extent_lock here only to close the race during trimming and + * to silence tools like KCSAN. + */ spin_lock_irq(&root->ordered_extent_lock); - spin_lock(&tree->lock); - /* Remove from tree once */ - node = &ordered->rb_node; - rb_erase(node, &tree->tree); - RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + spin_lock(&inode->ordered_tree_lock); + /* + * We don't have overlapping ordered extents (that would imply double + * allocation of extents) and we checked above that the split length + * does not exceed the ordered extent's num_bytes, so there's no need + * to remove it and re-insert it in the tree.
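The comment above fixes a strict nesting order for the split path: the root's ordered_extent_lock is the outer lock (taken with IRQs disabled), the per-inode ordered_tree_lock the inner one. A userspace sketch of the same discipline, with pthread spinlocks standing in for the kernel primitives (there is no IRQ state to manage in user space):

#include <pthread.h>

struct root_t  { pthread_spinlock_t ordered_extent_lock; };
struct inode_t { pthread_spinlock_t ordered_tree_lock; };

/* Always outer lock first, inner lock second; unlock in reverse order. */
static void split_locked_region(struct root_t *root, struct inode_t *inode)
{
        pthread_spin_lock(&root->ordered_extent_lock);   /* spin_lock_irq() */
        pthread_spin_lock(&inode->ordered_tree_lock);    /* spin_lock() */

        /* ... update file_offset/disk_bytenr/num_bytes, insert new node ... */

        pthread_spin_unlock(&inode->ordered_tree_lock);
        pthread_spin_unlock(&root->ordered_extent_lock);
}

int main(void)
{
        struct root_t r;
        struct inode_t i;

        pthread_spin_init(&r.ordered_extent_lock, PTHREAD_PROCESS_PRIVATE);
        pthread_spin_init(&i.ordered_tree_lock, PTHREAD_PROCESS_PRIVATE);
        split_locked_region(&r, &i);
        return 0;
}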
+ */ ordered->file_offset += len; ordered->disk_bytenr += len; ordered->num_bytes -= len; @@ -1227,19 +1320,12 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( offset += sum->len; } - /* Re-insert the node */ - node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); - if (node) - btrfs_panic(fs_info, -EEXIST, - "zoned: inconsistency in ordered tree at offset %llu", - ordered->file_offset); - - node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); - if (node) + node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node); + if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, - "zoned: inconsistency in ordered tree at offset %llu", + "inconsistency in ordered tree at offset %llu after split", new->file_offset); - spin_unlock(&tree->lock); + spin_unlock(&inode->ordered_tree_lock); list_add_tail(&new->root_extent_list, &root->ordered_extents); root->nr_ordered_extents++; @@ -1249,10 +1335,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( int __init ordered_data_init(void) { - btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", - sizeof(struct btrfs_ordered_extent), 0, - SLAB_MEM_SPREAD, - NULL); + btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 173bd5c5df26..4e152736d06c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,12 +6,20 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H -/* one of these per inode */ -struct btrfs_ordered_inode_tree { - spinlock_t lock; - struct rb_root tree; - struct rb_node *last; -}; +#include <linux/types.h> +#include <linux/list.h> +#include <linux/refcount.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <linux/wait.h> +#include "async-thread.h" + +struct inode; +struct page; +struct extent_state; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_fs_info; struct btrfs_ordered_sum { /* @@ -104,13 +112,6 @@ struct btrfs_ordered_extent { u64 bytes_left; /* - * the end of the ordered extent which is behind it but - * didn't update disk_i_size. Please see the comment of - * btrfs_ordered_update_i_size(); - */ - u64 outstanding_isize; - - /* * If we get truncated we need to adjust the file extent we enter for * this ordered extent so that we do not expose stale data. 
*/ @@ -129,7 +130,7 @@ struct btrfs_ordered_extent { refcount_t refs; /* the inode we belong to */ - struct inode *inode; + struct btrfs_inode *inode; /* list of checksums for insertion when the extent io is done */ struct list_head list; @@ -151,15 +152,9 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; -}; -static inline void -btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) -{ - spin_lock_init(&t->lock); - t->tree = RB_ROOT; - t->last = NULL; -} + struct list_head bioc_list; +}; int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); @@ -167,26 +162,37 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, +void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct folio *folio, u64 file_offset, u64 len, bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, bool uptodate); + struct folio *folio, u64 file_offset, + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); + +/* + * This represents details about the target file extent item of a write operation. + */ +struct btrfs_file_extent { + u64 disk_bytenr; + u64 disk_num_bytes; + u64 num_bytes; + u64 ram_bytes; + u64 offset; + u8 compression; +}; + struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type); + const struct btrfs_file_extent *file_extent, unsigned long flags); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( @@ -198,9 +204,9 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, - const u64 range_start, const u64 range_len); + const struct btrfs_block_group *bg); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, - const u64 range_start, const u64 range_len); + const struct btrfs_block_group *bg); void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); @@ -208,6 +214,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len); 
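/*
 * Editor's sketch, not part of the patch: with the reworked allocation API
 * above, callers describe the target file extent item once in a struct
 * btrfs_file_extent instead of passing a long positional argument list.
 * The helper below is hypothetical and only illustrates the call shape for
 * a plain, uncompressed write:
 */
static inline struct btrfs_ordered_extent *
example_alloc_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
			     u64 disk_bytenr, u64 len)
{
	const struct btrfs_file_extent file_extent = {
		.disk_bytenr	= disk_bytenr,
		.disk_num_bytes	= len,	/* on-disk size equals logical size */
		.num_bytes	= len,
		.ram_bytes	= len,
		.offset		= 0,	/* no offset into the extent */
		.compression	= BTRFS_COMPRESS_NONE,
	};

	return btrfs_alloc_ordered_extent(inode, file_offset, &file_extent,
					  (1U << BTRFS_ORDERED_REGULAR));
}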
+void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 7a1b021b5669..9f3ad124104f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -4,15 +4,13 @@ */ #include "ctree.h" -#include "disk-io.h" #include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; - int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -22,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - - btrfs_free_path(path); - return ret; + return btrfs_insert_empty_item(trans, root, path, &key, 0); } int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -45,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - if (ret) { /* JDM: Really? */ - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); + return ret; + if (ret) + return -ENOENT; -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h index 3faab5cbb59a..aa54a88a60de 100644 --- a/fs/btrfs/orphan.h +++ b/fs/btrfs/orphan.h @@ -3,6 +3,11 @@ #ifndef BTRFS_ORPHAN_H #define BTRFS_ORPHAN_H +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_root; + int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 815a5fc3ff9d..fc821aa446f0 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -9,6 +9,8 @@ #include "print-tree.h" #include "accessors.h" #include "tree-checker.h" +#include "volumes.h" +#include "raid-stripe-tree.h" struct root_name_map { u64 id; @@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = { { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, + { BTRFS_RAID_STRIPE_TREE_OBJECTID, "RAID_STRIPE_TREE" }, }; const char *btrfs_root_name(const struct btrfs_key *key, char *buf) @@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } +static void print_extent_owner_ref(const struct extent_buffer *eb, + const struct btrfs_extent_owner_ref *ref) +{ + ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA)); + pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref)); +} + static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; + struct btrfs_extent_owner_ref *oref; struct btrfs_disk_key key; unsigned long end; unsigned long ptr; @@ -98,7 +109,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type btrfs_err(eb->fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, 
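/*
 * Editor's note, not part of the patch: @quota_ctl_args arrives from the
 * BTRFS_IOC_QUOTA_CTL ioctl, so userspace requests simple quotas roughly
 * like this (sketch, error handling omitted):
 *
 *	struct btrfs_ioctl_quota_ctl_args args = {
 *		.cmd = BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA,
 *	};
 *	ioctl(fs_fd, BTRFS_IOC_QUOTA_CTL, &args);
 */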
sizeof(*ei)); - btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL); + return; } ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); @@ -161,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type "\t\t\t(parent %llu not aligned to sectorsize %u)\n", offset, eb->fs_info->sectorsize); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + print_extent_owner_ref(eb, oref); + break; default: pr_cont("(extent %llu has INVALID ref type %d)\n", eb->start, type); @@ -189,6 +204,17 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, } } +static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size, + struct btrfs_stripe_extent *stripe) +{ + const int num_stripes = btrfs_num_raid_stripes(item_size); + + for (int i = 0; i < num_stripes; i++) + pr_info("\t\t\tstride %d devid %llu physical %llu\n", + i, btrfs_raid_stride_devid(eb, &stripe->strides[i]), + btrfs_raid_stride_physical(eb, &stripe->strides[i])); +} + /* * Helper to output refs and locking status of extent buffer. Useful to debug * race condition related problems. @@ -279,6 +305,9 @@ void btrfs_print_leaf(const struct extent_buffer *l) case BTRFS_EXTENT_DATA_KEY: fi = btrfs_item_ptr(l, i, struct btrfs_file_extent_item); + pr_info("\t\tgeneration %llu type %hhu\n", + btrfs_file_extent_generation(l, fi), + btrfs_file_extent_type(l, fi)); if (btrfs_file_extent_type(l, fi) == BTRFS_FILE_EXTENT_INLINE) { pr_info("\t\tinline extent data size %llu\n", @@ -349,6 +378,10 @@ void btrfs_print_leaf(const struct extent_buffer *l) print_uuid_item(l, btrfs_item_ptr_offset(l, i), btrfs_item_size(l, i)); break; + case BTRFS_RAID_STRIPE_KEY: + print_raid_stripe_key(l, btrfs_item_size(l, i), + btrfs_item_ptr(l, i, struct btrfs_stripe_extent)); + break; } } } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index c42bc666d5ee..8504bf1702c7 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,6 +9,9 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 +struct extent_buffer; +struct btrfs_key; + void btrfs_print_leaf(const struct extent_buffer *l); void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 0755af0e53e3..b8fa34e16abb 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include <linux/hashtable.h> +#include <linux/xattr.h> #include "messages.h" #include "props.h" #include "btrfs_inode.h" @@ -15,6 +16,7 @@ #include "fs.h" #include "accessors.h" #include "super.h" +#include "dir-item.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); @@ -25,7 +27,7 @@ struct prop_handler { int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); - const char *(*extract)(struct inode *inode); + const char *(*extract)(const struct inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -102,7 +104,7 @@ bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) return handler->ignore(inode); } -int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, const char *name, const char *value, size_t value_len, 
int flags) { @@ -114,29 +116,29 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, return -EINVAL; if (value_len == 0) { - ret = btrfs_setxattr(trans, inode, handler->xattr_name, + ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); if (ret) return ret; - ret = handler->apply(inode, NULL, 0); + ret = handler->apply(&inode->vfs_inode, NULL, 0); ASSERT(ret == 0); return ret; } - ret = btrfs_setxattr(trans, inode, handler->xattr_name, value, + ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, value, value_len, flags); if (ret) return ret; - ret = handler->apply(inode, value, value_len); + ret = handler->apply(&inode->vfs_inode, value, value_len); if (ret) { - btrfs_setxattr(trans, inode, handler->xattr_name, NULL, + btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); return ret; } - set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); return 0; } @@ -266,7 +268,7 @@ static void inode_prop_iterator(void *ctx, btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", handler->xattr_name, btrfs_ino(BTRFS_I(inode)), - root->root_key.objectid, ret); + btrfs_root_id(root), ret); else set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } @@ -301,7 +303,7 @@ static int prop_compression_validate(const struct btrfs_inode *inode, static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int type; /* Reset to defaults */ @@ -357,7 +359,7 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode) return false; } -static const char *prop_compression_extract(struct inode *inode) +static const char *prop_compression_extract(const struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { case BTRFS_COMPRESS_ZLIB: @@ -383,7 +385,7 @@ static struct prop_handler prop_handlers[] = { }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *parent) + struct inode *inode, const struct inode *parent) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 6e283196e38a..63546d0a9444 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,11 +6,16 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H -#include "ctree.h" +#include <linux/compiler_types.h> + +struct inode; +struct btrfs_inode; +struct btrfs_path; +struct btrfs_trans_handle; int __init btrfs_props_init(void); -int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, const char *name, const char *value, size_t value_len, int flags); int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, @@ -21,6 +26,6 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, - struct inode *dir); + const struct inode *dir); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1b9f4f16d124..6b181bf9f156 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,6 +30,25 @@ #include "root-tree.h" #include "tree-checker.h" +enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info) +{ + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, 
&fs_info->flags)) + return BTRFS_QGROUP_MODE_DISABLED; + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) + return BTRFS_QGROUP_MODE_SIMPLE; + return BTRFS_QGROUP_MODE_FULL; +} + +bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; +} + +bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; +} + /* * Helpers to access qgroup reservation * @@ -88,7 +107,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *dest, - struct btrfs_qgroup *src) + const struct btrfs_qgroup *src) { int i; @@ -98,7 +117,7 @@ static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *dest, - struct btrfs_qgroup *src) + const struct btrfs_qgroup *src) { int i; @@ -122,47 +141,27 @@ static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, qg->new_refcnt += mod; } -static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg, u64 seq) { if (qg->old_refcnt < seq) return 0; return qg->old_refcnt - seq; } -static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +static inline u64 btrfs_qgroup_get_new_refcnt(const struct btrfs_qgroup *qg, u64 seq) { if (qg->new_refcnt < seq) return 0; return qg->new_refcnt - seq; } -/* - * glue structure to represent the relations between qgroups. - */ -struct btrfs_qgroup_list { - struct list_head next_group; - struct list_head next_member; - struct btrfs_qgroup *group; - struct btrfs_qgroup *member; -}; - -static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) -{ - return (u64)(uintptr_t)qg; -} - -static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) -{ - return (struct btrfs_qgroup *)(uintptr_t)n->aux; -} - static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); /* must be called with qgroup_ioctl_lock held */ -static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, +static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info, u64 qgroupid) { struct rb_node *n = fs_info->qgroup_tree.rb_node; @@ -180,35 +179,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, return NULL; } -/* must be called with qgroup_lock held */ +/* + * Add qgroup to the filesystem's qgroup tree. + * + * Must be called with qgroup_lock held and @prealloc preallocated. + * + * The control on the lifespan of @prealloc would be transferred to this + * function, thus caller should no longer touch @prealloc. + */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *prealloc, u64 qgroupid) { struct rb_node **p = &fs_info->qgroup_tree.rb_node; struct rb_node *parent = NULL; struct btrfs_qgroup *qgroup; + /* Caller must have pre-allocated @prealloc. 
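 * (Editor's note: the expected call pattern, mirroring the callers added
 *  in this patch, is
 *
 *      prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
 *      if (!prealloc)
 *              return -ENOMEM;
 *      spin_lock(&fs_info->qgroup_lock);
 *      qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
 *      spin_unlock(&fs_info->qgroup_lock);
 *
 *  with @prealloc never dereferenced again by the caller, since it was
 *  either linked into the tree or freed here.)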
*/ + ASSERT(prealloc); + while (*p) { parent = *p; qgroup = rb_entry(parent, struct btrfs_qgroup, node); - if (qgroup->qgroupid < qgroupid) + if (qgroup->qgroupid < qgroupid) { p = &(*p)->rb_left; - else if (qgroup->qgroupid > qgroupid) + } else if (qgroup->qgroupid > qgroupid) { p = &(*p)->rb_right; - else + } else { + kfree(prealloc); return qgroup; + } } - qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); - if (!qgroup) - return ERR_PTR(-ENOMEM); - + qgroup = prealloc; qgroup->qgroupid = qgroupid; INIT_LIST_HEAD(&qgroup->groups); INIT_LIST_HEAD(&qgroup->members); INIT_LIST_HEAD(&qgroup->dirty); INIT_LIST_HEAD(&qgroup->iterator); + INIT_LIST_HEAD(&qgroup->nested_iterator); rb_link_node(&qgroup->node, parent, p); rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); @@ -255,27 +265,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) /* * Add relation specified by two qgroups. * - * Must be called with qgroup_lock held. + * Must be called with qgroup_lock held, the ownership of @prealloc is + * transferred to this function and caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors */ -static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) +static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, + struct btrfs_qgroup *member, + struct btrfs_qgroup *parent) { - struct btrfs_qgroup_list *list; - - if (!member || !parent) + if (!member || !parent) { + kfree(prealloc); return -ENOENT; + } - list = kzalloc(sizeof(*list), GFP_ATOMIC); - if (!list) - return -ENOMEM; - - list->group = parent; - list->member = member; - list_add_tail(&list->next_group, &member->groups); - list_add_tail(&list->next_member, &parent->members); + prealloc->group = parent; + prealloc->member = member; + list_add_tail(&prealloc->next_group, &member->groups); + list_add_tail(&prealloc->next_member, &parent->members); return 0; } @@ -289,7 +298,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p * -ENOENT if one of the ids does not exist * <0 other errors */ -static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +static int add_relation_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_list *prealloc, + u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; @@ -297,7 +308,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); - return __add_relation_rb(member, parent); + return __add_relation_rb(prealloc, member, parent); } /* Must be called with qgroup_lock held */ @@ -325,7 +336,7 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info, } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, +int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl) { struct btrfs_qgroup *qgroup; @@ -341,11 +352,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) { + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); } +static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot, 
+ struct btrfs_qgroup_status_item *ptr) +{ + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); + fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); +} + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -362,7 +384,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) u64 flags = 0; u64 rescan_progress = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!fs_info->quota_root) return 0; fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); @@ -412,14 +434,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) "old qgroup version, quota disabled"); goto out; } - if (btrfs_qgroup_status_generation(l, ptr) != - fs_info->generation) { + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { + qgroup_read_enable_gen(fs_info, l, slot, ptr); + } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } - fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, - ptr); rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -435,11 +457,34 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup_mark_inconsistent(fs_info); } if (!qgroup) { - qgroup = add_qgroup_rb(fs_info, found_key.offset); - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); + struct btrfs_qgroup *prealloc; + struct btrfs_root *tree_root = fs_info->tree_root; + + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + if (!prealloc) { + ret = -ENOMEM; goto out; } + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); + /* + * If a qgroup exists for a subvolume ID, it is possible + * that subvolume has been deleted, in which case + * re-using that ID would lead to incorrect accounting. + * + * Ensure that we skip any such subvol ids. + * + * We don't need to lock because this is only called + * during mount before we start doing things like creating + * subvolumes. + */ + if (is_fstree(qgroup->qgroupid) && + qgroup->qgroupid > tree_root->free_objectid) + /* + * Don't need to check against BTRFS_LAST_FREE_OBJECTID, + * as it will get checked on the next call to + * btrfs_get_free_objectid. 
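 * (Editor's example: if tree_root->free_objectid is 258 but a stale
 *  qgroup 0/260 survived its subvolume's deletion, free_objectid is
 *  bumped to 261, so a freshly created subvolume can never be handed ID
 *  260 and silently inherit the stale counters.)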
+ */ + tree_root->free_objectid = qgroup->qgroupid + 1; } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) @@ -490,6 +535,8 @@ next1: if (ret) goto out; while (1) { + struct btrfs_qgroup_list *list = NULL; + slot = path->slots[0]; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); @@ -503,8 +550,14 @@ next1: goto next2; } - ret = add_relation_rb(fs_info, found_key.objectid, + list = kzalloc(sizeof(*list), GFP_KERNEL); + if (!list) { + ret = -ENOMEM; + goto out; + } + ret = add_relation_rb(fs_info, list, found_key.objectid, found_key.offset); + list = NULL; if (ret == -ENOENT) { btrfs_warn(fs_info, "orphan qgroup relation 0x%llx->0x%llx", @@ -523,13 +576,12 @@ next2: out: btrfs_free_path(path); fs_info->qgroup_flags |= flags; - if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && - ret >= 0) - ret = qgroup_rescan_init(fs_info, rescan_progress, 0); - - if (ret < 0) { + if (ret >= 0) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON) + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); + } else { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; @@ -546,12 +598,12 @@ out: * Return false if no reserved space is left. * Return true if some reserved space is leaked. */ -bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) +bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info) { struct rb_node *node; bool ret = false; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup @@ -950,7 +1002,8 @@ out: return ret; } -int btrfs_quota_enable(struct btrfs_fs_info *fs_info) +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) { struct btrfs_root *quota_root; struct btrfs_root *tree_root = fs_info->tree_root; @@ -960,8 +1013,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_key key; struct btrfs_key found_key; struct btrfs_qgroup *qgroup = NULL; + struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; struct ulist *ulist = NULL; + const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; @@ -1064,8 +1119,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); - fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; + if (simple) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); + btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); + } else { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); @@ -1095,6 +1156,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) /* Release locks on tree_root before we access quota_root */ 
btrfs_release_path(path); + /* We should not have a stray @prealloc pointer. */ + ASSERT(prealloc == NULL); + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + ret = add_qgroup_item(trans, quota_root, found_key.offset); if (ret) { @@ -1102,7 +1172,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out_free_path; } - qgroup = add_qgroup_rb(fs_info, found_key.offset); + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); + prealloc = NULL; if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); btrfs_abort_transaction(trans, ret); @@ -1145,18 +1216,22 @@ out_add_root: goto out_free_path; } - qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - btrfs_abort_transaction(trans, ret); + ASSERT(prealloc == NULL); + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; goto out_free_path; } + qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); + prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } + fs_info->qgroup_enable_gen = trans->transid; + mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid @@ -1183,6 +1258,10 @@ out_add_root: set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_unlock(&fs_info->qgroup_lock); + /* Skip rescan for simple qgroups. */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + goto out_free_path; + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1223,9 +1302,37 @@ out: else if (trans) ret = btrfs_end_transaction(trans); ulist_free(ulist); + kfree(prealloc); return ret; } +/* + * It is possible to have outstanding ordered extents which reserved bytes + * before we disabled. We need to fully flush delalloc, ordered extents, and a + * commit to ensure that we don't leak such reservations, only to have them + * come back if we re-enable. + * + * - enable simple quotas + * - reserve space + * - release it, store rsv_bytes in OE + * - disable quotas + * - enable simple quotas (qgroup rsv are all 0) + * - OE finishes + * - run delayed refs + * - free rsv_bytes, resulting in miscounting or even underflow + */ +static int flush_reservations(struct btrfs_fs_info *fs_info) +{ + int ret; + + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); + if (ret) + return ret; + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); + + return btrfs_commit_current_transaction(fs_info->tree_root); +} + int btrfs_quota_disable(struct btrfs_fs_info *fs_info) { struct btrfs_root *quota_root = NULL; @@ -1239,16 +1346,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) lockdep_assert_held_write(&fs_info->subvol_sem); /* - * Lock the cleaner mutex to prevent races with concurrent relocation, - * because relocation may be building backrefs for blocks of the quota - * root while we are deleting the root. This is like dropping fs roots - * of deleted snapshots/subvolumes, we need the same protection. - * - * This also prevents races between concurrent tasks trying to disable - * quotas, because we will unlock and relock qgroup_ioctl_lock across - * BTRFS_FS_QUOTA_ENABLED changes. + * Relocation will mess with backrefs, so make sure we have the + * cleaner_mutex held to protect us from relocate. 
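 * (Editor's note: responsibility for taking the lock moved to the caller,
 *  which is why the mutex_lock() below became a lockdep assertion;
 *
 *      mutex_lock(&fs_info->cleaner_mutex);
 *      ret = btrfs_quota_disable(fs_info);
 *      mutex_unlock(&fs_info->cleaner_mutex);
 *
 *  is now the expected calling sequence.)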
*/ - mutex_lock(&fs_info->cleaner_mutex); + lockdep_assert_held(&fs_info->cleaner_mutex); mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) @@ -1271,6 +1372,17 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_qgroup_wait_for_completion(fs_info, false); /* + * We have nothing held here and no trans handle, just return the error + * if there is one and set back the quota enabled bit since we didn't + * actually disable quotas. + */ + ret = flush_reservations(fs_info); + if (ret) { + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + return ret; + } + + /* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in @@ -1296,7 +1408,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -1332,9 +1445,7 @@ out: if (ret && trans) btrfs_end_transaction(trans); else if (trans) - ret = btrfs_end_transaction(trans); - mutex_unlock(&fs_info->cleaner_mutex); - + ret = btrfs_commit_transaction(trans); return ret; } @@ -1377,14 +1488,11 @@ static void qgroup_iterator_clean(struct list_head *head) * * Caller should hold fs_info->qgroup_lock. */ -static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct ulist *tmp, u64 ref_root, +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; - struct btrfs_qgroup_list *glist; - struct ulist_node *unode; - struct ulist_iterator uiter; + LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; int ret = 0; @@ -1392,53 +1500,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, if (!qgroup) goto out; - qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; - - WARN_ON(sign < 0 && qgroup->excl < num_bytes); - qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; - - if (sign > 0) - qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); - else - qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); - - qgroup_dirty(fs_info, qgroup); - - /* Get all of the parent groups that contain this qgroup */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; - /* Iterate all of the parents and adjust their reference counts */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - qgroup = unode_aux_to_qgroup(unode); qgroup->rfer += sign * num_bytes; qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); else qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); - qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); - /* Add any parents of the parents */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if 
(ret < 0) - goto out; - } + /* Append parent qgroups to @qgroup_list. */ + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } ret = 0; out: + qgroup_iterator_clean(&qgroup_list); return ret; } @@ -1455,24 +1540,19 @@ out: * Return < 0 for other error. */ static int quick_update_accounting(struct btrfs_fs_info *fs_info, - struct ulist *tmp, u64 src, u64 dst, - int sign) + u64 src, u64 dst, int sign) { struct btrfs_qgroup *qgroup; int ret = 1; - int err = 0; qgroup = find_qgroup_rb(fs_info, src); if (!qgroup) goto out; if (qgroup->excl == qgroup->rfer) { - ret = 0; - err = __qgroup_excl_accounting(fs_info, tmp, dst, - qgroup, sign); - if (err < 0) { - ret = err; + ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); + if (ret < 0) goto out; - } + ret = 0; } out: if (ret) @@ -1480,28 +1560,25 @@ out: return ret; } -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, - u64 dst) +/* + * Add relation between @src and @dst qgroup. The @prealloc is allocated by the + * callers and transferred here (either used or freed on error). + */ +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst, + struct btrfs_qgroup_list *prealloc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; - struct ulist *tmp; - unsigned int nofs_flag; int ret = 0; + ASSERT(prealloc); + /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; - /* We hold a transaction handle open, must do a NOFS allocation. */ - nofs_flag = memalloc_nofs_save(); - tmp = ulist_alloc(GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!tmp) - return -ENOMEM; - mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; @@ -1533,16 +1610,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } spin_lock(&fs_info->qgroup_lock); - ret = __add_relation_rb(member, parent); + ret = __add_relation_rb(prealloc, member, parent); + prealloc = NULL; if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; } - ret = quick_update_accounting(fs_info, tmp, src, dst, 1); + ret = quick_update_accounting(fs_info, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: + kfree(prealloc); mutex_unlock(&fs_info->qgroup_ioctl_lock); - ulist_free(tmp); return ret; } @@ -1553,19 +1631,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; - struct ulist *tmp; bool found = false; - unsigned int nofs_flag; int ret = 0; int ret2; - /* We hold a transaction handle open, must do a NOFS allocation. 
*/ - nofs_flag = memalloc_nofs_save(); - tmp = ulist_alloc(GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!tmp) - return -ENOMEM; - if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; @@ -1603,11 +1672,10 @@ delete_item: if (found) { spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); - ret = quick_update_accounting(fs_info, tmp, src, dst, -1); + ret = quick_update_accounting(fs_info, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); } out: - ulist_free(tmp); return ret; } @@ -1629,6 +1697,7 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *prealloc = NULL; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); @@ -1643,31 +1712,77 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + goto out; + } + ret = add_qgroup_item(trans, quota_root, qgroupid); if (ret) goto out; spin_lock(&fs_info->qgroup_lock); - qgroup = add_qgroup_rb(fs_info, qgroupid); + qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid); spin_unlock(&fs_info->qgroup_lock); + prealloc = NULL; - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - goto out; - } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + kfree(prealloc); return ret; } -static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) +/* + * Return 0 if we can not delete the qgroup (not empty or has children etc). + * Return >0 if we can delete the qgroup. + * Return <0 for other errors during tree search. + */ +static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { - return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || - qgroup->excl > 0 || qgroup->excl_cmpr > 0 || - qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + /* + * Squota would never be inconsistent, but there can still be case + * where a dropped subvolume still has qgroup numbers, and squota + * relies on such qgroup for future accounting. + * + * So for squota, do not allow dropping any non-zero qgroup. + */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && + (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr)) + return 0; + + /* For higher level qgroup, we can only delete it if it has no child. */ + if (btrfs_qgroup_level(qgroup->qgroupid)) { + if (!list_empty(&qgroup->members)) + return 0; + return 1; + } + + /* + * For level-0 qgroups, we can only delete it if it has no subvolume + * for it. + * This means even a subvolume is unlinked but not yet fully dropped, + * we can not delete the qgroup. + */ + key.objectid = qgroup->qgroupid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = -1ULL; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); + btrfs_free_path(path); + /* + * The @ret from btrfs_find_root() exactly matches our definition for + * the return value, thus can be returned directly. 
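 * (Editor's gloss: btrfs_find_root() returns 0 when a root item is found,
 *  i.e. the subvolume still exists and deletion must be refused (0); it
 *  returns 1 when no root item exists, i.e. safe to delete (>0); and <0
 *  on search errors.)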
+ */ + return ret; } int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) @@ -1689,7 +1804,10 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } - if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { + ret = can_delete_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + if (ret == 0) { ret = -EBUSY; goto out; } @@ -1714,6 +1832,45 @@ } spin_lock(&fs_info->qgroup_lock); + /* + * Warn on reserved space. The qgroup being deleted should have no + * child qgroup nor a corresponding subvolume. + * Thus its reserved space should all be zero, regardless of whether the + * qgroup is consistent or which mode is in use. + */ + if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + + } + /* + * The same for rfer/excl numbers, but that's only if our qgroup is + * consistent and if it's in regular qgroup mode. + * For simple mode it's not as accurate, thus we can hit non-zero values + * very frequently. + */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL && + !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { + if (qgroup->rfer || qgroup->excl || + qgroup->rfer_cmpr || qgroup->excl_cmpr) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rfer, qgroup->rfer_cmpr, + qgroup->excl, qgroup->excl_cmpr); + qgroup_mark_inconsistent(fs_info); + } + } del_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); @@ -1729,6 +1886,40 @@ out: return ret; } +int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid) +{ + struct btrfs_trans_handle *trans; + int ret; + + if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root) + return 0; + + /* + * Commit the current transaction to make sure all the rfer/excl numbers + * get updated. + */ + ret = btrfs_commit_current_transaction(fs_info->quota_root); + if (ret < 0) + return ret; + + /* Start a new trans to delete the qgroup info and limit items. */ + trans = btrfs_start_transaction(fs_info->quota_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_remove_qgroup(trans, subvolid); + btrfs_end_transaction(trans); + /* + * Either it's squota and the subvolume still has numbers needed for + * future accounting, in which case we can not delete it and should just + * skip it, or the qgroup was already removed by a qgroup rescan. In + * both cases it is safe to ignore the error. + */ + if (ret == -EBUSY || ret == -ENOENT) + ret = 0; + return ret; +} + int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit) { @@ -1806,47 +1997,90 @@ out: return ret; } +/* + * Inform qgroup to trace one dirty extent; its info is recorded in @record + * so that qgroup can account for it at transaction commit time.
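 *
 * (Editor's note: dirty extent records are now tracked in an xarray keyed
 *  by record->bytenr >> fs_info->sectorsize_bits (e.g. bytenr 1 GiB with
 *  4K sectors maps to index 262144), so duplicate detection is a single
 *  xa_load() instead of an rb-tree walk, and callers can pre-reserve the
 *  slot with xa_reserve() before taking the spinlock so that the
 *  __xa_store() done under xa_lock with GFP_ATOMIC cannot fail on
 *  allocation.)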
+ * + * No lock version, caller must acquire delayed ref lock and allocated memory, + * then call btrfs_qgroup_trace_extent_post() after exiting lock context. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Return <0 for insertion failure, caller can free @record safely. + */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) { - struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_qgroup_extent_record *entry; - u64 bytenr = record->bytenr; + struct btrfs_qgroup_extent_record *existing, *ret; + const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); + + if (!btrfs_qgroup_full_accounting(fs_info)) + return 1; + +#if BITS_PER_LONG == 32 + if (record->bytenr >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, +"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", + record->bytenr); + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; + } +#endif lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, - node); - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - } else { - if (record->data_rsv && !entry->data_rsv) { - entry->data_rsv = record->data_rsv; - entry->data_rsv_refroot = - record->data_rsv_refroot; - } - return 1; + xa_lock(&delayed_refs->dirty_extents); + existing = xa_load(&delayed_refs->dirty_extents, index); + if (existing) { + if (record->data_rsv && !existing->data_rsv) { + existing->data_rsv = record->data_rsv; + existing->data_rsv_refroot = record->data_rsv_refroot; } + xa_unlock(&delayed_refs->dirty_extents); + return 1; + } + + ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); + xa_unlock(&delayed_refs->dirty_extents); + if (xa_is_err(ret)) { + qgroup_mark_inconsistent(fs_info); + return xa_err(ret); } - rb_link_node(&record->node, parent_node, p); - rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); return 0; } +/* + * Post handler after qgroup_trace_extent_nolock(). + * + * NOTE: Current qgroup does the expensive backref walk at transaction + * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming + * new transaction. + * This is designed to allow btrfs_find_all_roots() to get correct new_roots + * result. + * + * However for old_roots there is no need to do backref walk at that time, + * since we search commit roots to walk backref and result will always be + * correct. + * + * Due to the nature of no lock version, we can't do backref there. + * So we must call btrfs_qgroup_trace_extent_post() after exiting + * spinlock context. + * + * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result + * using current root, then we can move all expensive backref walk out of + * transaction committing, but not now as qgroup accounting will be wrong again. + */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; + if (!btrfs_qgroup_full_accounting(trans->fs_info)) + return 0; /* * We are always called in a context where we are already holding a * transaction handle. 
Often we are called when adding a data delayed @@ -1894,21 +2128,39 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, return 0; } +/* + * Inform qgroup to trace one dirty extent, specified by @bytenr and + * @num_bytes. + * So qgroup can account it at commit trans time. + * + * Better encapsulated version, with memory allocation and backref walk for + * commit roots. + * So this can sleep. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) - || bytenr == 0 || num_bytes == 0) + if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) return 0; record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM; + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) { + kfree(record); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; record->bytenr = bytenr; record->num_bytes = num_bytes; @@ -1917,13 +2169,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); spin_unlock(&delayed_refs->lock); - if (ret > 0) { + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } return btrfs_qgroup_trace_extent_post(trans, record); } +/* + * Inform qgroup to trace all leaf items of data + * + * Return 0 for success + * Return <0 for error(ENOMEM) + */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { @@ -1935,7 +2195,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, u64 bytenr, num_bytes; /* We can be called directly from walk_up_proc() */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; for (i = 0; i < nr; i++) { @@ -2311,7 +2571,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, int level; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* Wrong parameter order */ @@ -2354,6 +2614,16 @@ out: return ret; } +/* + * Inform qgroup to trace a whole subtree, including all its child tree + * blocks and data. + * The root tree block is specified by @root_eb. + * + * Normally used by relocation(tree block swap) and subvolume deletion. 
+ * + * Return 0 for success + * Return <0 for error(ENOMEM or tree search error) + */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level) @@ -2365,10 +2635,10 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; - BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); - BUG_ON(root_eb == NULL); + ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); + ASSERT(root_eb != NULL); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; spin_lock(&fs_info->qgroup_lock); @@ -2480,62 +2750,64 @@ out: return ret; } +static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) +{ + if (!list_empty(&qgroup->nested_iterator)) + return; + + list_add_tail(&qgroup->nested_iterator, head); +} + +static void qgroup_iterator_nested_clean(struct list_head *head) +{ + while (!list_empty(head)) { + struct btrfs_qgroup *qgroup; + + qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator); + list_del_init(&qgroup->nested_iterator); + } +} + #define UPDATE_NEW 0 #define UPDATE_OLD 1 /* * Walk all of the roots that points to the bytenr and adjust their refcnts. */ -static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - struct ulist *qgroups, u64 seq, int update_old) +static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct list_head *qgroups, + u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret = 0; if (!roots) - return 0; + return; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + LIST_HEAD(tmp); + qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - ulist_reinit(tmp); - ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + qgroup_iterator_nested_add(qgroups, qg); + qgroup_iterator_add(&tmp, qg); + list_for_each_entry(qg, &tmp, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(tmp_unode); if (update_old) btrfs_qgroup_update_old_refcnt(qg, seq, 1); else btrfs_qgroup_update_new_refcnt(qg, seq, 1); + list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - qgroup_to_aux(glist->group), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), - GFP_ATOMIC); - if (ret < 0) - return ret; + qgroup_iterator_nested_add(qgroups, glist->group); + qgroup_iterator_add(&tmp, glist->group); } } + qgroup_iterator_clean(&tmp); } - return 0; } /* @@ -2574,22 +2846,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * But this time we don't need to consider other things, the codes and logic * is easy to understand now. 
*/ -static int qgroup_update_counters(struct btrfs_fs_info *fs_info, - struct ulist *qgroups, - u64 nr_old_roots, - u64 nr_new_roots, - u64 num_bytes, u64 seq) +static void qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct list_head *qgroups, u64 nr_old_roots, + u64 nr_new_roots, u64 num_bytes, u64 seq) { - struct ulist_node *unode; - struct ulist_iterator uiter; struct btrfs_qgroup *qg; - u64 cur_new_count, cur_old_count; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(qgroups, &uiter))) { + list_for_each_entry(qg, qgroups, nested_iterator) { + u64 cur_new_count, cur_old_count; bool dirty = false; - qg = unode_aux_to_qgroup(unode); cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); @@ -2660,7 +2926,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, if (dirty) qgroup_dirty(fs_info, qg); } - return 0; } /* @@ -2697,8 +2962,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct ulist *new_roots) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct ulist *qgroups = NULL; - struct ulist *tmp = NULL; + LIST_HEAD(qgroups); u64 seq; u64 nr_new_roots = 0; u64 nr_old_roots = 0; @@ -2708,7 +2972,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (!btrfs_qgroup_full_accounting(fs_info) || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; @@ -2730,17 +2994,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) { - ret = -ENOMEM; - goto out_free; - } - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out_free; - } - mutex_lock(&fs_info->qgroup_rescan_lock); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { @@ -2755,29 +3008,27 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, seq = fs_info->qgroup_seq; /* Update old refcnts using old_roots */ - ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, - UPDATE_OLD); - if (ret < 0) - goto out; + qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); /* Update new refcnts using new_roots */ - ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, - UPDATE_NEW); - if (ret < 0) - goto out; + qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); - qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); /* + * We're done using the iterator, release all its qgroups while holding + * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() + * and trigger use-after-free accesses to qgroups. 
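 * (Editor's note: these list_head based iterators replace the old ulists;
 *  membership is an O(1) !list_empty() test on the qgroup's embedded
 *  nested_iterator node, so building the set needs no GFP_ATOMIC
 *  allocations while fs_info->qgroup_lock is held, and cleanup is a plain
 *  list_del_init() on each visited qgroup.)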
+ */ + qgroup_iterator_nested_clean(&qgroups); + + /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; -out: spin_unlock(&fs_info->qgroup_lock); out_free: - ulist_free(tmp); - ulist_free(qgroups); ulist_free(old_roots); ulist_free(new_roots); return ret; @@ -2789,17 +3040,17 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; - struct rb_node *node; + unsigned long index; u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return 0; + delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; - while ((node = rb_first(&delayed_refs->dirty_extent_root))) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - + xa_for_each(&delayed_refs->dirty_extents, index, record) { num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); @@ -2865,7 +3116,7 @@ cleanup: ulist_free(record->old_roots); ulist_free(new_roots); new_roots = NULL; - rb_erase(node, &delayed_refs->dirty_extent_root); + xa_erase(&delayed_refs->dirty_extents, index); kfree(record); } @@ -2909,7 +3160,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_enabled(fs_info)) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; else fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; @@ -2922,6 +3173,159 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) return ret; } +int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_inherit *inherit, + size_t size) +{ + if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) + return -EOPNOTSUPP; + if (size < sizeof(*inherit) || size > PAGE_SIZE) + return -EINVAL; + + /* + * In the past we allowed btrfs_qgroup_inherit to specify to copy + * rfer/excl numbers directly from other qgroups. This behavior has + * been disabled in userspace for a very long time, but here we should + * also disable it in kernel, as this behavior is known to mark qgroup + * inconsistent, and a rescan would wipe out the changes anyway. + * + * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies. + */ + if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) + return -EINVAL; + + if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) + return -EINVAL; + + /* + * Skip the inherit source qgroups check if qgroup is not enabled. + * Qgroup can still be later enabled causing problems, but in that case + * btrfs_qgroup_inherit() would just ignore those invalid ones. + */ + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + + /* + * Now check all the remaining qgroups, they should all: + * + * - Exist + * - Be higher level qgroups. 
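btrfs_qgroup_check_inherit() above validates a user-supplied, variable-length payload by requiring the copied size to equal struct_size(inherit, qgroups, inherit->num_qgroups) exactly; the kernel helper saturates to SIZE_MAX on multiplication overflow, which the equality test then rejects. A userspace model of the same check with a hand-rolled overflow-safe helper (toy layout, hypothetical names):

    #include <stdint.h>
    #include <stdio.h>

    /* Toy layout: fixed header followed by num_qgroups trailing u64 ids. */
    struct inherit_args {
            uint64_t flags;
            uint64_t num_qgroups;
            uint64_t qgroups[];     /* flexible array member */
    };

    /* header + n * elem, saturating to SIZE_MAX like the kernel's struct_size(). */
    static size_t checked_struct_size(size_t header, size_t elem, uint64_t n)
    {
            if (n > (SIZE_MAX - header) / elem)
                    return SIZE_MAX;
            return header + (size_t)n * elem;
    }

    static int check_inherit(const struct inherit_args *args, size_t copied)
    {
            size_t want = checked_struct_size(sizeof(*args), sizeof(uint64_t),
                                              args->num_qgroups);

            if (want == SIZE_MAX || copied != want)
                    return -1;      /* -EINVAL in the kernel */
            return 0;
    }

    int main(void)
    {
            struct inherit_args hdr = { .num_qgroups = 2 };

            /* Caller claimed 2 qgroup ids but only copied the header: rejected. */
            printf("%d\n", check_inherit(&hdr, sizeof(hdr)));
            printf("%d\n", check_inherit(&hdr, sizeof(hdr) + 2 * sizeof(uint64_t)));
            return 0;
    }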
+ */ + for (int i = 0; i < inherit->num_qgroups; i++) { + struct btrfs_qgroup *qgroup; + u64 qgroupid = inherit->qgroups[i]; + + if (btrfs_qgroup_level(qgroupid) == 0) + return -EINVAL; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + spin_unlock(&fs_info->qgroup_lock); + return -ENOENT; + } + spin_unlock(&fs_info->qgroup_lock); + } + return 0; +} + +static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, + u64 inode_rootid, + struct btrfs_qgroup_inherit **inherit) +{ + int i = 0; + u64 num_qgroups = 0; + struct btrfs_qgroup *inode_qg; + struct btrfs_qgroup_list *qg_list; + struct btrfs_qgroup_inherit *res; + size_t struct_sz; + u64 *qgids; + + if (*inherit) + return -EEXIST; + + inode_qg = find_qgroup_rb(fs_info, inode_rootid); + if (!inode_qg) + return -ENOENT; + + num_qgroups = list_count_nodes(&inode_qg->groups); + + if (!num_qgroups) + return 0; + + struct_sz = struct_size(res, qgroups, num_qgroups); + if (struct_sz == SIZE_MAX) + return -ERANGE; + + res = kzalloc(struct_sz, GFP_NOFS); + if (!res) + return -ENOMEM; + res->num_qgroups = num_qgroups; + qgids = res->qgroups; + + list_for_each_entry(qg_list, &inode_qg->groups, next_group) + qgids[i++] = qg_list->group->qgroupid; + + *inherit = res; + return 0; +} + +/* + * Check if we can skip rescan when inheriting qgroups. If @src has a single + * @parent, and that @parent is owning all its bytes exclusively, we can skip + * the full rescan, by just adding nodesize to the @parent's excl/rfer. + * + * Return <0 for fatal errors (like srcid/parentid has no qgroup). + * Return 0 if a quick inherit is done. + * Return >0 if a quick inherit is not possible, and a full rescan is needed. + */ +static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, + u64 srcid, u64 parentid) +{ + struct btrfs_qgroup *src; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + int nr_parents = 0; + + src = find_qgroup_rb(fs_info, srcid); + if (!src) + return -ENOENT; + parent = find_qgroup_rb(fs_info, parentid); + if (!parent) + return -ENOENT; + + /* + * Source has no parent qgroup, but our new qgroup would have one. + * Qgroup numbers would become inconsistent. + */ + if (list_empty(&src->groups)) + return 1; + + list_for_each_entry(list, &src->groups, next_group) { + /* The parent is not the same, quick update is not possible. */ + if (list->group->qgroupid != parentid) + return 1; + nr_parents++; + /* + * More than one parent qgroup, we can't be sure about accounting + * consistency. + */ + if (nr_parents > 1) + return 1; + } + + /* + * The parent is not exclusively owning all its bytes. We're not sure + * if the source has any bytes not fully owned by the parent. + */ + if (parent->excl != parent->rfer) + return 1; + + parent->excl += fs_info->nodesize; + parent->rfer += fs_info->nodesize; + return 0; +} + /* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will @@ -2929,20 +3333,27 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) * when a readonly fs is a reasonable outcome. 
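qgroup_snapshot_quick_inherit() above encodes a narrow fast path: if the source subvolume has exactly one parent qgroup and that parent's excl equals its rfer (it owns every byte exclusively), then the snapshot only adds one new root node, so bumping both counters by a nodesize keeps the numbers exact and the rescan can be skipped. A toy model of the decision, with invented example numbers (the kernel checks the real relation lists):

    #include <stdio.h>

    struct qgroup { unsigned long long rfer, excl; };

    /* Returns 0 and updates the parent when safe, 1 when a rescan is needed. */
    static int quick_inherit(struct qgroup *parent, int src_nr_parents,
                             unsigned long long nodesize)
    {
            if (src_nr_parents != 1)          /* zero or many parents: rescan */
                    return 1;
            if (parent->excl != parent->rfer) /* shared bytes: can't be sure */
                    return 1;
            parent->excl += nodesize;         /* snapshot adds one new root node */
            parent->rfer += nodesize;
            return 0;
    }

    int main(void)
    {
            struct qgroup parent = { .rfer = 1 << 20, .excl = 1 << 20 };

            if (quick_inherit(&parent, 1, 16384) == 0)
                    printf("quick: parent now rfer=%llu excl=%llu\n",
                           parent.rfer, parent.excl);

            parent.excl -= 4096;              /* some bytes now shared elsewhere */
            printf("needs rescan: %d\n", quick_inherit(&parent, 1, 16384));
            return 0;
    }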
*/ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, - u64 objectid, struct btrfs_qgroup_inherit *inherit) + u64 objectid, u64 inode_rootid, + struct btrfs_qgroup_inherit *inherit) { int ret = 0; - int i; u64 *i_qgroups; bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; + struct btrfs_qgroup *prealloc; + struct btrfs_qgroup_list **qlist_prealloc = NULL; + bool free_inherit = false; bool need_rescan = false; u32 level_size = 0; u64 nums; + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) + return -ENOMEM; + /* * There are only two callers of this function. * @@ -2962,7 +3373,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_enabled(fs_info)) goto out; quota_root = fs_info->quota_root; @@ -2971,11 +3382,18 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, goto out; } + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) { + ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); + if (ret) + goto out; + free_inherit = true; + } + if (inherit) { i_qgroups = (u64 *)(inherit + 1); nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2 * inherit->num_excl_copies; - for (i = 0; i < nums; ++i) { + for (int i = 0; i < nums; i++) { srcgroup = find_qgroup_rb(fs_info, *i_qgroups); /* @@ -3002,7 +3420,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, */ if (inherit) { i_qgroups = (u64 *)(inherit + 1); - for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { + for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) { if (*i_qgroups == 0) continue; ret = add_qgroup_relation_item(trans, objectid, @@ -3015,16 +3433,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, goto out; } ret = 0; - } + qlist_prealloc = kcalloc(inherit->num_qgroups, + sizeof(struct btrfs_qgroup_list *), + GFP_NOFS); + if (!qlist_prealloc) { + ret = -ENOMEM; + goto out; + } + for (int i = 0; i < inherit->num_qgroups; i++) { + qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list), + GFP_NOFS); + if (!qlist_prealloc[i]) { + ret = -ENOMEM; + goto out; + } + } + } spin_lock(&fs_info->qgroup_lock); - dstgroup = add_qgroup_rb(fs_info, objectid); - if (IS_ERR(dstgroup)) { - ret = PTR_ERR(dstgroup); - goto unlock; - } + dstgroup = add_qgroup_rb(fs_info, prealloc, objectid); + prealloc = NULL; if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { dstgroup->lim_flags = inherit->lim.flags; @@ -3036,7 +3466,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); } - if (srcid) { + if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; @@ -3063,29 +3493,40 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); + + /* + * If the source qgroup has parent but the new one doesn't, + * we need a full rescan. 
+ */ + if (!inherit && !list_empty(&srcgroup->groups)) + need_rescan = true; } if (!inherit) goto unlock; i_qgroups = (u64 *)(inherit + 1); - for (i = 0; i < inherit->num_qgroups; ++i) { + for (int i = 0; i < inherit->num_qgroups; i++) { if (*i_qgroups) { - ret = add_relation_rb(fs_info, objectid, *i_qgroups); + ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, + *i_qgroups); + qlist_prealloc[i] = NULL; if (ret) goto unlock; } + if (srcid) { + /* Check if we can do a quick inherit. */ + ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); + if (ret < 0) + goto unlock; + if (ret > 0) + need_rescan = true; + ret = 0; + } ++i_qgroups; - - /* - * If we're doing a snapshot, and adding the snapshot to a new - * qgroup, the numbers are guaranteed to be incorrect. - */ - if (srcid) - need_rescan = true; } - for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { + for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; @@ -3106,7 +3547,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, /* Manually tweaking numbers certainly needs a rescan */ need_rescan = true; } - for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { + for (int i = 0; i < inherit->num_excl_copies; i++, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; @@ -3135,6 +3576,14 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) qgroup_mark_inconsistent(fs_info); + if (qlist_prealloc) { + for (int i = 0; i < inherit->num_qgroups; i++) + kfree(qlist_prealloc[i]); + kfree(qlist_prealloc); + } + if (free_inherit) + kfree(inherit); + kfree(prealloc); return ret; } @@ -3156,7 +3605,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, { struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; - u64 ref_root = root->root_key.objectid; + u64 ref_root = btrfs_root_id(root); int ret = 0; LIST_HEAD(qgroup_list); @@ -3218,9 +3667,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, enum btrfs_qgroup_rsv_type type) { struct btrfs_qgroup *qgroup; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; + LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return; @@ -3248,30 +3695,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, */ num_bytes = qgroup->rsv.values[type]; - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); - - qgroup_rsv_release(fs_info, qg, num_bytes, type); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; + qgroup_rsv_release(fs_info, qgroup, num_bytes, type); + list_for_each_entry(glist, &qgroup->groups, next_group) { + qgroup_iterator_add(&qgroup_list, glist->group); } } - out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } @@ -3306,6 +3740,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, int slot; int ret; + if (!btrfs_qgroup_full_accounting(fs_info)) + return 1; + 
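The restructuring of btrfs_qgroup_inherit() above follows a standard kernel idiom: every allocation that might sleep (the prealloc qgroup and the qlist_prealloc relation array) happens before spin_lock(&fs_info->qgroup_lock), slots are NULLed as ownership transfers inside the lock, and leftovers are freed unconditionally on the way out. A compact userspace sketch of the shape (hypothetical names; the toy has no real tree, so "consumed" nodes deliberately leak at exit):

    #include <stdlib.h>

    struct relation { int dummy; };

    static int inherit_relations(int n)
    {
            struct relation **prealloc;
            int ret = 0;

            prealloc = calloc(n, sizeof(*prealloc));
            if (!prealloc)
                    return -1;
            for (int i = 0; i < n; i++) {
                    prealloc[i] = malloc(sizeof(struct relation));
                    if (!prealloc[i]) {
                            ret = -1;
                            goto out;
                    }
            }

            /* ---- spin_lock() would start here: no sleeping allocations ---- */
            for (int i = 0; i < n; i++) {
                    /* link prealloc[i] into the tree; ownership moves, clear it */
                    prealloc[i] = NULL;
            }
            /* ---- spin_unlock() ---- */

    out:
            for (int i = 0; i < n; i++)
                    free(prealloc[i]);      /* free(NULL) is a no-op */
            free(prealloc);
            return ret;
    }

    int main(void) { return inherit_relations(3) ? 1 : 0; }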
mutex_lock(&fs_info->qgroup_rescan_lock); extent_root = btrfs_extent_root(fs_info, fs_info->qgroup_rescan_progress.objectid); @@ -3386,10 +3823,15 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { - return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || - !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; + if (btrfs_fs_closing(fs_info)) + return true; + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + return true; + if (!btrfs_qgroup_enabled(fs_info)) + return true; + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) + return true; + return false; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3398,14 +3840,18 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - int err = -ENOMEM; int ret = 0; bool stopped = false; bool did_leaf_rescans = false; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; + path = btrfs_alloc_path(); - if (!path) + if (!path) { + ret = -ENOMEM; goto out; + } /* * Rescan should only search for commit root, and any later difference * should be recorded by qgroup @@ -3413,18 +3859,17 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path->search_commit_root = 1; path->skip_locking = 1; - err = 0; - while (!err && !(stopped = rescan_should_stop(fs_info))) { + while (!ret && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); break; } - err = qgroup_rescan_leaf(trans, path); + ret = qgroup_rescan_leaf(trans, path); did_leaf_rescans = true; - if (err > 0) + if (ret > 0) btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); @@ -3434,10 +3879,10 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (err > 0 && + if (ret > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - } else if (err < 0 || stopped) { + } else if (ret < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3452,11 +3897,11 @@ out: if (did_leaf_rescans) { trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", - err); + ret); } } else { trans = NULL; @@ -3467,11 +3912,11 @@ out: fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", - err); + int ret2 = update_qgroup_status_item(trans); + + if (ret2 < 0) { + ret = ret2; + btrfs_err(fs_info, "fail to update qgroup status: %d", ret); } } fs_info->qgroup_rescan_running = false; @@ -3488,11 +3933,11 @@ out: btrfs_info(fs_info, "qgroup scan paused"); } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { btrfs_info(fs_info, "qgroup scan cancelled"); - } else if (err >= 0) { + } else if (ret >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", - err > 0 ? " (inconsistency flag cleared)" : ""); + ret > 0 ? 
" (inconsistency flag cleared)" : ""); } else { - btrfs_err(fs_info, "qgroup scan failed with %d", err); + btrfs_err(fs_info, "qgroup scan failed with %d", ret); } } @@ -3506,18 +3951,23 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, { int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); + return -EINVAL; + } + if (!init_flags) { /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled"); - ret = -EINVAL; + ret = -ENOTCONN; } if (ret) @@ -3528,15 +3978,13 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (init_flags) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - btrfs_warn(fs_info, - "qgroup rescan is already in progress"); ret = -EINPROGRESS; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled"); - ret = -EINVAL; - } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + ret = -ENOTCONN; + } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; } @@ -3557,7 +4005,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, mutex_unlock(&fs_info->qgroup_rescan_lock); btrfs_init_work(&fs_info->qgroup_rescan_work, - btrfs_qgroup_rescan_worker, NULL, NULL); + btrfs_qgroup_rescan_worker, NULL); return 0; } @@ -3584,7 +4032,6 @@ int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) { int ret = 0; - struct btrfs_trans_handle *trans; ret = qgroup_rescan_init(fs_info, 0, 1); if (ret) @@ -3601,16 +4048,10 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) * going to clear all tracking information for a clean start. */ - trans = btrfs_attach_transaction_barrier(fs_info->fs_root); - if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { + ret = btrfs_commit_current_transaction(fs_info->fs_root); + if (ret) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - return PTR_ERR(trans); - } else if (trans != ERR_PTR(-ENOENT)) { - ret = btrfs_commit_transaction(trans); - if (ret) { - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - return ret; - } + return ret; } qgroup_rescan_zero_tracking(fs_info); @@ -3746,7 +4187,6 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, */ static int try_flush_qgroup(struct btrfs_root *root) { - struct btrfs_trans_handle *trans; int ret; /* Can't hold an open transaction or we run the risk of deadlocking. 
*/ @@ -3769,17 +4209,9 @@ static int try_flush_qgroup(struct btrfs_root *root) ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; - btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - if (ret == -ENOENT) - ret = 0; - goto out; - } + btrfs_wait_ordered_extents(root, U64_MAX, NULL); - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(root); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); wake_up(&root->qgroup_flush_wait); @@ -3797,8 +4229,8 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, u64 to_reserve; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || - !is_fstree(root->root_key.objectid) || len == 0) + if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || + !is_fstree(btrfs_root_id(root)) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -3914,7 +4346,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, + btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed, BTRFS_QGROUP_RSV_DATA); if (freed_ret) *freed_ret = freed; @@ -3932,8 +4364,11 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int trace_op = QGROUP_RELEASE; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) - return 0; + if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { + return clear_record_extent_bits(&inode->io_tree, start, + start + len - 1, + EXTENT_QGROUP_RESERVED, NULL); + } /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); @@ -3951,7 +4386,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, + btrfs_root_id(inode->root), changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); if (released) *released = changeset.bytes_changed; @@ -4045,8 +4480,8 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->root_key.objectid) || num_bytes == 0) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || + !is_fstree(btrfs_root_id(root)) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -4082,18 +4517,22 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); } +/* + * Per-transaction meta reservation should be all freed at transaction commit + * time + */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->root_key.objectid)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || + !is_fstree(btrfs_root_id(root))) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ - btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -4102,8 +4541,8 @@ void 
__btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->root_key.objectid)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || + !is_fstree(btrfs_root_id(root))) return; /* @@ -4114,8 +4553,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); - btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, - num_bytes, type); + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, @@ -4152,18 +4590,24 @@ out: spin_unlock(&fs_info->qgroup_lock); } +/* + * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. + * + * This is called when preallocated meta reservation needs to be used. + * Normally after btrfs_join_transaction() call. + */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->root_key.objectid)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || + !is_fstree(btrfs_root_id(root))) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); - qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); if (!sb_rdonly(fs_info->sb)) add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -4192,7 +4636,7 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) btrfs_ino(inode), unode->val, unode->aux); } btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, + btrfs_root_id(inode->root), changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } @@ -4266,7 +4710,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > @@ -4376,9 +4820,9 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, int ret = 0; int i; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (!is_fstree(root->root_key.objectid) || !root->reloc_root) + if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root) return 0; spin_lock(&blocks->lock); @@ -4449,13 +4893,69 @@ out: void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) { struct btrfs_qgroup_extent_record *entry; - struct btrfs_qgroup_extent_record *next; - struct rb_root *root; + unsigned long index; - root = &trans->delayed_refs.dirty_extent_root; - rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) { ulist_free(entry->old_roots); kfree(entry); } - *root = RB_ROOT; + xa_destroy(&trans->delayed_refs.dirty_extents); +} + +void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) +{ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + 
return; + + if (!is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); +} + +int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, + const struct btrfs_squota_delta *delta) +{ + int ret; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *qg; + LIST_HEAD(qgroup_list); + u64 root = delta->root; + u64 num_bytes = delta->num_bytes; + const int sign = (delta->is_inc ? 1 : -1); + + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return 0; + + if (!is_fstree(root)) + return 0; + + /* If the extent predates enabling quotas, don't count it. */ + if (delta->generation < fs_info->qgroup_enable_gen) + return 0; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, root); + if (!qgroup) { + ret = -ENOENT; + goto out; + } + + ret = 0; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qg, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; + + qg->excl += num_bytes * sign; + qg->rfer += num_bytes * sign; + qgroup_dirty(fs_info, qg); + + list_for_each_entry(glist, &qg->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); + } + qgroup_iterator_clean(&qgroup_list); + +out: + spin_unlock(&fs_info->qgroup_lock); + return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 1203f0632099..c229256d6fd5 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -6,12 +6,22 @@ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H +#include <linux/types.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/kobject.h> -#include "ulist.h" -#include "delayed-ref.h" -#include "misc.h" +#include <linux/list.h> +#include <uapi/linux/btrfs_tree.h> + +struct extent_buffer; +struct extent_changeset; +struct btrfs_delayed_extent_op; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_ioctl_quota_ctl_args; +struct btrfs_trans_handle; +struct btrfs_delayed_ref_root; +struct btrfs_inode; /* * Btrfs qgroup overview @@ -101,15 +111,22 @@ * subtree rescan for them. */ -#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) -#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) +/* + * These flags share the flags field of the btrfs_qgroup_status_item with the + * persisted flags defined in btrfs_tree.h. + * + * To minimize the chance of collision with new persisted status flags, these + * count backwards from the MSB. + */ +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) + +#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3) /* * Record a dirty extent, and info qgroup to update quota on it - * TODO: Use kmem cache to alloc it. */ struct btrfs_qgroup_extent_record { - struct rb_node node; u64 bytenr; u64 num_bytes; @@ -229,6 +246,24 @@ struct btrfs_qgroup { * finished. */ struct list_head iterator; + + /* + * For nested iterator usage. + * + * Here we support at most one level of nested iterator calls like: + * + * LIST_HEAD(all_qgroups); + * { + * LIST_HEAD(local_qgroups); + * qgroup_iterator_add(local_qgroups, qg); + * qgroup_iterator_nested_add(all_qgroups, qg); + * do_some_work(local_qgroups); + * qgroup_iterator_clean(local_qgroups); + * } + * do_some_work(all_qgroups); + * qgroup_iterator_nested_clean(all_qgroups); + */ + struct list_head nested_iterator; struct rb_node node; /* tree of qgroups */ /* @@ -244,6 +279,27 @@ struct btrfs_qgroup { struct kobject kobj; }; +/* Glue structure to represent the relations between qgroups. 
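btrfs_record_squota_delta() above is the heart of simple quotas: a delta of num_bytes * sign is charged to both rfer and excl of the owning qgroup and every ancestor reached through the iterator walk, and extents whose generation predates qgroup_enable_gen are skipped entirely. A toy model with a single-parent chain (the kernel walks arbitrary parent lists; names and numbers here are illustrative):

    #include <stdio.h>

    struct qgroup {
            unsigned long long rfer, excl;
            struct qgroup *parent;  /* toy: one parent */
    };

    /* Apply a squota delta up the hierarchy; sign is +1 (alloc) or -1 (free). */
    static void record_squota_delta(struct qgroup *qg, long long num_bytes,
                                    int sign, unsigned long long extent_gen,
                                    unsigned long long enable_gen)
    {
            if (extent_gen < enable_gen)  /* extent predates quota enable */
                    return;
            for (; qg; qg = qg->parent) {
                    qg->rfer += num_bytes * sign;
                    qg->excl += num_bytes * sign; /* squota: all bytes exclusive */
            }
    }

    int main(void)
    {
            struct qgroup high = { 0, 0, NULL };
            struct qgroup subvol = { 0, 0, &high };

            record_squota_delta(&subvol, 1 << 20, +1, 100, 50);  /* counted */
            record_squota_delta(&subvol, 1 << 20, +1, 10, 50);   /* too old */
            printf("subvol rfer=%llu high rfer=%llu\n", subvol.rfer, high.rfer);
            return 0;
    }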
*/ +struct btrfs_qgroup_list { + struct list_head next_group; + struct list_head next_member; + struct btrfs_qgroup *group; + struct btrfs_qgroup *member; +}; + +struct btrfs_squota_delta { + /* The fstree root this delta counts against. */ + u64 root; + /* The number of bytes in the extent being counted. */ + u64 num_bytes; + /* The generation the extent was created in. */ + u64 generation; + /* Whether we are using or freeing the extent. */ + bool is_inc; + /* Whether the extent is data or metadata. */ + bool is_data; +}; + static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) { return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); @@ -258,98 +314,44 @@ enum { ENUM_BIT(QGROUP_FREE), }; -int btrfs_quota_enable(struct btrfs_fs_info *fs_info); +enum btrfs_qgroup_mode { + BTRFS_QGROUP_MODE_DISABLED, + BTRFS_QGROUP_MODE_FULL, + BTRFS_QGROUP_MODE_SIMPLE +}; + +enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info); +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, - u64 dst); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst, + struct btrfs_qgroup_list *prealloc); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); +int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid); int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; -/* - * Inform qgroup to trace one dirty extent, its info is recorded in @record. - * So qgroup can account it at transaction committing time. - * - * No lock version, caller must acquire delayed ref lock and allocated memory, - * then call btrfs_qgroup_trace_extent_post() after exiting lock context. - * - * Return 0 for success insert - * Return >0 for existing record, caller can free @record safely. - * Error is not possible - */ int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record); - -/* - * Post handler after qgroup_trace_extent_nolock(). - * - * NOTE: Current qgroup does the expensive backref walk at transaction - * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming - * new transaction. - * This is designed to allow btrfs_find_all_roots() to get correct new_roots - * result. - * - * However for old_roots there is no need to do backref walk at that time, - * since we search commit roots to walk backref and result will always be - * correct. - * - * Due to the nature of no lock version, we can't do backref there. - * So we must call btrfs_qgroup_trace_extent_post() after exiting - * spinlock context. 
- * - * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result - * using current root, then we can move all expensive backref walk out of - * transaction committing, but not now as qgroup accounting will be wrong again. - */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord); - -/* - * Inform qgroup to trace one dirty extent, specified by @bytenr and - * @num_bytes. - * So qgroup can account it at commit trans time. - * - * Better encapsulated version, with memory allocation and backref walk for - * commit roots. - * So this can sleep. - * - * Return 0 if the operation is done. - * Return <0 for error, like memory allocation failure or invalid parameter - * (NULL trans) - */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); - -/* - * Inform qgroup to trace all leaf items of data - * - * Return 0 for success - * Return <0 for error(ENOMEM) - */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb); -/* - * Inform qgroup to trace a whole subtree, including all its child tree - * blocks and data. - * The root tree block is specified by @root_eb. - * - * Normally used by relocation(tree block swap) and subvolume deletion. - * - * Return 0 for success - * Return <0 for error(ENOMEM or tree search error) - */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); @@ -358,14 +360,18 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct ulist *new_roots); int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); +int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_inherit *inherit, + size_t size); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, - u64 objectid, struct btrfs_qgroup_inherit *inherit); + u64 objectid, u64 inode_rootid, + struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, +int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif @@ -417,20 +423,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, BTRFS_QGROUP_RSV_META_PREALLOC); } -/* - * Per-transaction meta reservation should be all freed at transaction commit - * time - */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); - -/* - * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. - * - * This is called when preallocated meta reservation needs to be used. - * Normally after btrfs_join_transaction() call. 
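The doc comments being removed above (they moved next to the definitions in qgroup.c) describe a two-phase protocol worth spelling out: btrfs_qgroup_trace_extent_nolock() only links a record while the delayed-ref spinlock is held, and the expensive backref walk runs later in btrfs_qgroup_trace_extent_post() once the lock is dropped. A userspace sketch of that record-now, resolve-later split (hypothetical names):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct record { unsigned long long bytenr; int resolved; struct record *next; };

    static pthread_mutex_t refs_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct record *dirty;

    static void trace_extent_nolock(struct record *rec)
    {
            /* caller holds refs_lock: only a list insert, nothing that sleeps */
            rec->next = dirty;
            dirty = rec;
    }

    static void trace_extent_post(struct record *rec)
    {
            /* lock dropped: the slow part (the backref walk in btrfs) runs here */
            rec->resolved = 1;
    }

    int main(void)
    {
            struct record *rec = calloc(1, sizeof(*rec));

            if (!rec)
                    return 1;
            rec->bytenr = 13631488;
            pthread_mutex_lock(&refs_lock);
            trace_extent_nolock(rec);
            pthread_mutex_unlock(&refs_lock);
            trace_extent_post(rec);
            printf("bytenr %llu resolved=%d\n", rec->bytenr, rec->resolved);
            free(rec);
            return 0;
    }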
- */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); - void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode); /* btrfs_qgroup_swapped_blocks related functions */ @@ -447,6 +441,9 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); -bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); +bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); +void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); +int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, + const struct btrfs_squota_delta *delta); #endif diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c new file mode 100644 index 000000000000..4c859b550f6c --- /dev/null +++ b/fs/btrfs/raid-stripe-tree.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Western Digital Corporation or its affiliates. + */ + +#include <linux/btrfs_tree.h> +#include "ctree.h" +#include "fs.h" +#include "accessors.h" +#include "transaction.h" +#include "disk-io.h" +#include "raid-stripe-tree.h" +#include "volumes.h" +#include "print-tree.h" + +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *stripe_root = fs_info->stripe_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u64 found_start; + u64 found_end; + u64 end = start + length; + int slot; + int ret; + + if (!stripe_root) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = start; + key.type = BTRFS_RAID_STRIPE_KEY; + key.offset = length; + + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + + /* That stripe ends before we start, we're done. */ + if (found_end <= start) + break; + + trace_btrfs_raid_extent_delete(fs_info, start, end, + found_start, found_end); + + ASSERT(found_start >= start && found_end <= end); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + break; + + start += key.offset; + length -= key.offset; + if (length == 0) + break; + + btrfs_release_path(path); + } + + btrfs_free_path(path); + return ret; +} + +static int update_raid_extent_item(struct btrfs_trans_handle *trans, + struct btrfs_key *key, + struct btrfs_stripe_extent *stripe_extent, + const size_t item_size) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path, + 0, 1); + if (ret) + return (ret == 1 ? 
ret : -EINVAL); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), + item_size); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_free_path(path); + + return ret; +} + +static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key stripe_key; + struct btrfs_root *stripe_root = fs_info->stripe_root; + const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); + struct btrfs_stripe_extent *stripe_extent; + const size_t item_size = struct_size(stripe_extent, strides, num_stripes); + int ret; + + stripe_extent = kzalloc(item_size, GFP_NOFS); + if (!stripe_extent) { + btrfs_abort_transaction(trans, -ENOMEM); + btrfs_end_transaction(trans); + return -ENOMEM; + } + + trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size, + num_stripes); + for (int i = 0; i < num_stripes; i++) { + u64 devid = bioc->stripes[i].dev->devid; + u64 physical = bioc->stripes[i].physical; + u64 length = bioc->stripes[i].length; + struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; + + if (length == 0) + length = bioc->size; + + btrfs_set_stack_raid_stride_devid(raid_stride, devid); + btrfs_set_stack_raid_stride_physical(raid_stride, physical); + } + + stripe_key.objectid = bioc->logical; + stripe_key.type = BTRFS_RAID_STRIPE_KEY; + stripe_key.offset = bioc->size; + + ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, + item_size); + if (ret == -EEXIST) + ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, + item_size); + if (ret) + btrfs_abort_transaction(trans, ret); + + kfree(stripe_extent); + + return ret; +} + +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_io_context *bioc; + int ret; + + if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE)) + return 0; + + list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) { + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) + return ret; + } + + while (!list_empty(&ordered_extent->bioc_list)) { + bioc = list_first_entry(&ordered_extent->bioc_list, + typeof(*bioc), rst_ordered_entry); + list_del(&bioc->rst_ordered_entry); + btrfs_put_bioc(bioc); + } + + return 0; +} + +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length, u64 map_type, + u32 stripe_index, struct btrfs_io_stripe *stripe) +{ + struct btrfs_root *stripe_root = fs_info->stripe_root; + struct btrfs_stripe_extent *stripe_extent; + struct btrfs_key stripe_key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + const u64 end = logical + *length; + int num_stripes; + u64 offset; + u64 found_logical; + u64 found_length; + u64 found_end; + int slot; + int ret; + + stripe_key.objectid = logical; + stripe_key.type = BTRFS_RAID_STRIPE_KEY; + stripe_key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (stripe->rst_search_commit_root) { + path->skip_locking = 1; + path->search_commit_root = 1; + } + + ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); + if (ret < 0) + goto free_path; + if (ret) { + if (path->slots[0] != 0) + path->slots[0]--; + } + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + found_logical = found_key.objectid; + found_length = 
found_key.offset; + found_end = found_logical + found_length; + + if (found_logical > end) { + ret = -ENOENT; + goto out; + } + + if (in_range(logical, found_logical, found_length)) + break; + + ret = btrfs_next_item(stripe_root, path); + if (ret) + goto out; + } + + offset = logical - found_logical; + + /* + * If we have a logically contiguous, but physically non-continuous + * range, we need to split the bio. Record the length after which we + * must split the bio. + */ + if (end > found_end) + *length -= end - found_end; + + num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); + stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < num_stripes; i++) { + struct btrfs_raid_stride *stride = &stripe_extent->strides[i]; + u64 devid = btrfs_raid_stride_devid(leaf, stride); + u64 physical = btrfs_raid_stride_physical(leaf, stride); + + if (devid != stripe->dev->devid) + continue; + + if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i) + continue; + + stripe->physical = physical + offset; + + trace_btrfs_get_raid_extent_offset(fs_info, logical, *length, + stripe->physical, devid); + + ret = 0; + goto free_path; + } + + /* If we're here, we haven't found the requested devid in the stripe. */ + ret = -ENOENT; +out: + if (ret > 0) + ret = -ENOENT; + if (ret && ret != -EIO && !stripe->rst_search_commit_root) { + btrfs_debug(fs_info, + "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", + logical, logical + *length, stripe->dev->devid, + btrfs_bg_type_to_raid_name(map_type)); + } +free_path: + btrfs_free_path(path); + + return ret; +} diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h new file mode 100644 index 000000000000..1ac1c21aac2f --- /dev/null +++ b/fs/btrfs/raid-stripe-tree.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Western Digital Corporation or its affiliates. 
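The tail of btrfs_get_raid_extent_offset() above does two pieces of arithmetic: the physical address is the stride's physical start plus (logical - found_logical), and when the requested range runs past the item's end, *length is trimmed so the caller splits the bio at the discontinuity. A worked numeric sketch (hypothetical helper, illustrative numbers):

    #include <stdio.h>

    /* Map a logical range onto one stored RAID stripe extent; trims *length
     * at the item boundary and returns the physical start for one stride. */
    static unsigned long long map_onto_stripe(unsigned long long logical,
                                              unsigned long long *length,
                                              unsigned long long found_logical,
                                              unsigned long long found_length,
                                              unsigned long long stride_physical)
    {
            unsigned long long end = logical + *length;
            unsigned long long found_end = found_logical + found_length;

            if (end > found_end)                /* request crosses the item end: */
                    *length -= end - found_end; /* caller splits the bio here */

            return stride_physical + (logical - found_logical);
    }

    int main(void)
    {
            /* Item covers [1M, 1M+256K); we ask for 128K starting at 1M+192K. */
            unsigned long long len = 128 << 10;
            unsigned long long phys = map_onto_stripe((1 << 20) + (192 << 10),
                                                      &len, 1 << 20, 256 << 10,
                                                      8 << 20);

            /* Only 64K of the 128K request fits inside this item. */
            printf("physical=%llu usable length=%llu\n", phys, len);
            return 0;
    }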
+ */ + +#ifndef BTRFS_RAID_STRIPE_TREE_H +#define BTRFS_RAID_STRIPE_TREE_H + +#include <linux/types.h> +#include <uapi/linux/btrfs_tree.h> +#include "fs.h" + +#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID1_MASK | \ + BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID10) + +struct btrfs_io_context; +struct btrfs_io_stripe; +struct btrfs_fs_info; +struct btrfs_ordered_extent; +struct btrfs_trans_handle; + +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length); +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length, u64 map_type, + u32 stripe_index, struct btrfs_io_stripe *stripe); +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *ordered_extent); + +static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, + u64 map_type) +{ + u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK; + u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) + return false; + + if (type != BTRFS_BLOCK_GROUP_DATA) + return false; + + if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK) + return true; + + return false; +} + +static inline int btrfs_num_raid_stripes(u32 item_size) +{ + return item_size / sizeof(struct btrfs_raid_stride); +} + +#endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3e014b9370a3..8afadf994b8c 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -14,7 +14,6 @@ #include <linux/raid/xor.h> #include <linux/mm.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -41,6 +40,85 @@ #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc) +{ + if (unlikely(!bioc)) { + btrfs_crit(fs_info, "bioc=NULL"); + return; + } + btrfs_crit(fs_info, +"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u", + bioc->logical, bioc->full_stripe_logical, bioc->size, + bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes, + bioc->replace_stripe_src, bioc->num_stripes); + for (int i = 0; i < bioc->num_stripes; i++) { + btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu", + i, bioc->stripes[i].dev->devid, + bioc->stripes[i].physical); + } +} + +static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, + const struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + dump_bioc(fs_info, rbio->bioc); + btrfs_crit(fs_info, +"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx", + rbio->flags, rbio->nr_sectors, rbio->nr_data, + rbio->real_stripes, rbio->stripe_nsectors, + rbio->scrubp, rbio->dbitmap); +} + +#define ASSERT_RBIO(expr, rbio) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? 
\ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "logical=%llu", (logical)); \ + } \ + ASSERT((expr)); \ +}) + /* Used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; @@ -122,8 +200,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; - int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; - int i; + unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; if (info->stripe_hash_table) return 0; @@ -144,7 +221,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) h = table->table; - for (i = 0; i < num_entries; i++) { + for (unsigned int i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); @@ -332,12 +409,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) static void merge_rbio(struct btrfs_raid_bio *dest, struct btrfs_raid_bio *victim) { - bio_list_merge(&dest->bio_list, &victim->bio_list); + bio_list_merge_init(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; /* Also inherit the bitmaps from @victim. */ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); - bio_list_init(&victim->bio_list); } /* @@ -594,8 +670,8 @@ static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr) { - ASSERT(stripe_nr < rbio->real_stripes); - ASSERT(sector_nr < rbio->stripe_nsectors); + ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr); return stripe_nr * rbio->stripe_nsectors + sector_nr; } @@ -875,8 +951,10 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; int index; - ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); - ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, + rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, + rbio, sector_nr); index = stripe_nr * rbio->stripe_nsectors + sector_nr; ASSERT(index >= 0 && index < rbio->nr_sectors); @@ -918,6 +996,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); + /* + * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 + * (limited by u8). 
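The ASSERT_RBIO_* macros above all follow one pattern: when the expression is false, dump the bioc and rbio geometry first, then trip the ordinary ASSERT, so the crash log carries the state that caused the failure. A generic userspace rendering of the idiom (hypothetical names, not the kernel macros):

    #include <assert.h>
    #include <stdio.h>

    struct rbio { unsigned int real_stripes, nr_data; };

    static void dump_rbio(const struct rbio *rbio)
    {
            fprintf(stderr, "rbio real_stripes=%u nr_data=%u\n",
                    rbio->real_stripes, rbio->nr_data);
    }

    /* Like ASSERT_RBIO(): dump the object's state before asserting, so the
     * failure is debuggable from the log alone. */
    #define ASSERT_CTX(expr, obj)                   \
            do {                                    \
                    if (!(expr))                    \
                            dump_rbio(obj);         \
                    assert(expr);                   \
            } while (0)

    int main(void)
    {
            struct rbio rbio = { .real_stripes = 6, .nr_data = 4 };

            ASSERT_CTX(rbio.nr_data < rbio.real_stripes, &rbio);  /* passes */
            puts("geometry ok");
            return 0;
    }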
+ */ + ASSERT(real_stripes >= 2); + ASSERT(real_stripes <= U8_MAX); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -955,6 +1040,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + ASSERT(rbio->nr_data > 0); return rbio; } @@ -964,7 +1050,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { int ret; - ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false); if (ret < 0) return ret; /* Mapping all sectors */ @@ -979,7 +1065,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) int ret; ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, - rbio->stripe_pages + data_pages); + rbio->stripe_pages + data_pages, false); if (ret < 0) return ret; @@ -1051,8 +1137,10 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, * thus it can be larger than rbio->real_stripe. * So here we check against bioc->num_stripes, not rbio->real_stripes. */ - ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); - ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes, + rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, + rbio, sector_nr); ASSERT(sector->page); stripe = &rbio->bioc->stripes[stripe_nr]; @@ -1181,6 +1269,26 @@ static inline void bio_list_put(struct bio_list *bio_list) bio_put(bio); } +static void assert_rbio(struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || + !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + /* + * At least two stripes (2 disks RAID5), and since real_stripes is U8, + * we won't go beyond 256 disks anyway. + */ + ASSERT_RBIO(rbio->real_stripes >= 2, rbio); + ASSERT_RBIO(rbio->nr_data > 0, rbio); + + /* + * This is another check to make sure nr data stripes is smaller + * than total stripes. + */ + ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); +} + /* Generate PQ for one vertical stripe. 
*/ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1212,6 +1320,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { @@ -1530,7 +1639,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false); if (ret < 0) return ret; @@ -1549,7 +1658,6 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct work_struct work; }; /* @@ -1615,9 +1723,10 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) const u32 sectorsize = fs_info->sectorsize; u64 cur_logical; - ASSERT(orig_logical >= full_stripe_start && - orig_logical + orig_len <= full_stripe_start + - rbio->nr_data * BTRFS_STRIPE_LEN); + ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN, + rbio, orig_logical); bio_list_add(&rbio->bio_list, orig_bio); rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; @@ -2363,7 +2472,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, break; } } - ASSERT(i < rbio->real_stripes); + ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i); bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); return rbio; @@ -2474,6 +2583,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } if (has_qstripe) { + assert_rbio(rbio); /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); @@ -2528,7 +2638,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) * Replace is running and our parity stripe needs to be duplicated to * the target device. Check we have a valid source stripe number. 
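Both call sites above hand the heavy lifting to raid6_call.gen_syndrome(), which computes P as the XOR of the data blocks and Q as a GF(2^8) polynomial sum with generator 2. A byte-at-a-time reference for one vertical byte column, evaluated Horner-style as in lib/raid6 (the kernel dispatches to optimized SIMD implementations; this is only an illustration):

    #include <stdio.h>
    #include <stdint.h>

    /* Multiply by x (generator 2) in GF(2^8) with the RAID6 polynomial 0x11d. */
    static uint8_t gf_mul2(uint8_t v)
    {
            return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
    }

    /* P/Q syndrome for one byte column across nr_data disks:
     * P = d0 ^ d1 ^ ... and Q = d0 ^ 2*d1 ^ 4*d2 ^ ... in GF(2^8). */
    static void gen_syndrome(int nr_data, const uint8_t *d, uint8_t *p, uint8_t *q)
    {
            uint8_t wp = d[nr_data - 1], wq = d[nr_data - 1];

            for (int i = nr_data - 2; i >= 0; i--) {
                    wp ^= d[i];
                    wq = (uint8_t)(gf_mul2(wq) ^ d[i]);
            }
            *p = wp;
            *q = wq;
    }

    int main(void)
    {
            const uint8_t data[4] = { 0x11, 0x22, 0x33, 0x44 }; /* 4 data disks */
            uint8_t p, q;

            gen_syndrome(4, data, &p, &q);
            printf("P=0x%02x Q=0x%02x\n", p, q);
            return 0;
    }

Losing one data disk is recovered from P alone; losing two needs the independent Q equation, which is why the generator weights must differ per disk.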
*/ - ASSERT(rbio->bioc->replace_stripe_src >= 0); + ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 45e6ff78316f..0d7b4c2fb6ae 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,9 +7,18 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H +#include <linux/types.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/bio.h> +#include <linux/refcount.h> #include <linux/workqueue.h> #include "volumes.h" +struct page; +struct sector_ptr; +struct btrfs_fs_info; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, @@ -164,7 +173,7 @@ struct raid56_bio_trace_info { u8 stripe_nr; }; -static inline int nr_data_stripes(const struct map_lookup *map) +static inline int nr_data_stripes(const struct btrfs_chunk_map *map) { return map->num_stripes - btrfs_nr_parity_stripes(map->type); } diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c2b66d155ef..1c2d7cb1fe6f 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -6,6 +6,12 @@ #ifndef BTRFS_RCU_STRING_H #define BTRFS_RCU_STRING_H +#include <linux/types.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/rcupdate.h> +#include <linux/printk.h> + struct rcu_string { struct rcu_head rcu; char str[]; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 28ac7995716e..2928abf7eb82 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; - int type, ret; + int type; + int ret = 0; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); @@ -485,6 +486,13 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, ret = add_shared_data_ref(fs_info, offset, count, key->objectid, key->offset); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) { + btrfs_err(fs_info, + "found extent owner ref without simple quotas enabled"); + ret = -EINVAL; + } + break; default: btrfs_err(fs_info, "invalid key type in iref"); ret = -EINVAL; @@ -652,7 +660,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, } /* - * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * Called when we modify a ref for a bytenr. * * This will add an action item to the given bytenr and do sanity checks to make * sure we haven't messed something up. 
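nr_data_stripes() above merely switches its argument type to btrfs_chunk_map; the arithmetic is unchanged: data stripes = member stripes minus parity stripes, where RAID5 reserves one parity stripe per horizontal stripe and RAID6 two. A trivial standalone version of the same formula:

    #include <stdio.h>

    enum profile { RAID5 = 5, RAID6 = 6 };

    static int nr_parity_stripes(enum profile p) { return p == RAID6 ? 2 : 1; }

    /* data stripes = member devices minus parity devices */
    static int nr_data_stripes(int num_stripes, enum profile p)
    {
            return num_stripes - nr_parity_stripes(p);
    }

    int main(void)
    {
            printf("6-disk RAID6: %d data stripes\n", nr_data_stripes(6, RAID6));
            printf("4-disk RAID5: %d data stripes\n", nr_data_stripes(4, RAID5));
            return 0;
    }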
If we are making a new allocation and @@ -670,7 +678,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, int ret = 0; bool metadata; u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; + u64 num_bytes = generic_ref->num_bytes; u64 parent = generic_ref->parent; u64 ref_root = 0; u64 owner = 0; @@ -681,11 +689,11 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.owning_root; + ref_root = generic_ref->ref_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.owning_root; - owner = generic_ref->data_ref.ino; + ref_root = generic_ref->ref_root; + owner = generic_ref->data_ref.objectid; offset = generic_ref->data_ref.offset; } metadata = owner < BTRFS_FIRST_FREE_OBJECTID; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 855de37719b5..3511e1a5c96b 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -6,7 +6,16 @@ #ifndef BTRFS_REF_VERIFY_H #define BTRFS_REF_VERIFY_H +#include <linux/types.h> +#include <linux/rbtree_types.h> + +struct btrfs_fs_info; +struct btrfs_ref; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY + +#include <linux/spinlock.h> + int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9f60aa79a8c5..f0824c948cb7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -25,12 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, const u64 olen, int no_time_update) { - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); if (!no_time_update) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } /* * We round up to the block size at eof when determining which @@ -43,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -67,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; @@ -84,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret) goto out; - page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(mapping)); - if (!page) { + folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + btrfs_alloc_write_mask(mapping)); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out_unlock; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; @@ -116,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, offset_in_page(file_offset), data_start, - datal); + memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, 
+ datal); } else { - ret = btrfs_decompress(comp_type, data_start, page, - offset_in_page(file_offset), + ret = btrfs_decompress(comp_type, data_start, folio, + offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; - flush_dcache_page(page); + flush_dcache_folio(folio); } /* @@ -140,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) - memzero_page(page, datal, block_size - datal); + folio_zero_range(folio, datal, block_size - datal); - btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); - btrfs_page_clear_checked(fs_info, page, file_offset, block_size); - btrfs_page_set_dirty(fs_info, page, file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); + btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); + btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: - if (page) { - unlock_page(page); - put_page(page); + if (!IS_ERR(folio)) { + folio_unlock(folio); + folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, @@ -175,7 +175,7 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); struct btrfs_root *root = BTRFS_I(dst)->root; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); @@ -338,7 +338,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, int no_time_update) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = NULL; struct extent_buffer *leaf; struct btrfs_trans_handle *trans; @@ -617,35 +617,6 @@ out: return ret; } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL); -} - -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - u64 range1_end = loff1 + len - 1; - u64 range2_end = loff2 + len - 1; - - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - swap(range1_end, range2_end); - } else if (inode1 == inode2 && loff2 < loff1) { - swap(loff1, loff2); - swap(range1_end, range2_end); - } - - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL); - - btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); - btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); -} - static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) { if (inode1 < inode2) @@ -663,17 +634,21 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { + const u64 end = dst_loff + len - 1; + struct extent_state *cached_state = NULL; struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; /* - * Lock destination range to serialize with concurrent readahead() and - * source range to serialize 
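The deleted btrfs_double_extent_lock()/btrfs_double_extent_unlock() pair above implemented the usual ABBA-avoidance rule: when two extent ranges must be held at once, acquire them in one global order, here by inode address and then by offset. A condensed sketch of that retired rule, assuming the callers guarantee the two ranges are distinct (the dedupe and clone paths below now rely on the exclusive i_mmap_lock instead):

static void double_range_lock(struct inode *inode1, u64 loff1,
			      struct inode *inode2, u64 loff2, u64 len)
{
	u64 end1 = loff1 + len - 1;
	u64 end2 = loff2 + len - 1;

	/* Normalize to one global order: inode address, then offset. */
	if (inode1 < inode2) {
		swap(inode1, inode2);
		swap(loff1, loff2);
		swap(end1, end2);
	} else if (inode1 == inode2 && loff2 < loff1) {
		swap(loff1, loff2);
		swap(end1, end2);
	}
	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, end1, NULL);
	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, end2, NULL);
}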
with relocation. + * Lock destination range to serialize with concurrent readahead(), and + * we are safe from concurrency with relocation of source extents + * because we have already locked the inode's i_mmap_lock in exclusive + * mode. */ - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -691,7 +666,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (root_dst->send_in_progress) { btrfs_warn_rl(root_dst->fs_info, "cannot deduplicate to root %llu while send operations are using it (%d in progress)", - root_dst->root_key.objectid, + btrfs_root_id(root_dst), root_dst->send_in_progress); spin_unlock(&root_dst->root_item_lock); return -EAGAIN; @@ -725,13 +700,15 @@ out: static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 off, u64 olen, u64 destoff) { + struct extent_state *cached_state = NULL; struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int ret; int wb_ret; u64 len = olen; u64 bs = fs_info->sectorsize; + u64 end; /* * VFS's generic_remap_file_range_prep() protects us from cloning the @@ -757,26 +734,29 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * we found the previous extent covering eof and before we * attempted to increment its reference count). */ - ret = btrfs_wait_ordered_range(inode, wb_start, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start, destoff - wb_start); if (ret) return ret; } /* - * Lock destination range to serialize with concurrent readahead() and - * source range to serialize with relocation. + * Lock destination range to serialize with concurrent readahead(), and + * we are safe from concurrency with relocation of source extents + * because we have already locked the inode's i_mmap_lock in exclusive + * mode. */ - btrfs_double_extent_lock(src, off, inode, destoff, len); + end = destoff + len - 1; + lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - btrfs_double_extent_unlock(src, off, inode, destoff, len); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination * range, so wait for writeback to complete before truncating pages * from the page cache. This is a rare case. */ - wb_ret = btrfs_wait_ordered_range(inode, destoff, len); + wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len); ret = ret ? 
ret : wb_ret; /* * Truncate page cache pages so that future reads will see the cloned @@ -856,11 +836,11 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h index ecb309b4dad0..1e291f7d85c4 100644 --- a/fs/btrfs/reflink.h +++ b/fs/btrfs/reflink.h @@ -3,7 +3,9 @@ #ifndef BTRFS_REFLINK_H #define BTRFS_REFLINK_H -#include <linux/fs.h> +#include <linux/types.h> + +struct file; loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 299eac696eb4..79eb984041dd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -36,6 +36,7 @@ #include "relocation.h" #include "super.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" /* * Relocation overview @@ -111,8 +112,8 @@ struct tree_block { }; /* Use rb_simple_node for search/insert */ u64 owner; struct btrfs_key key; - unsigned int level:8; - unsigned int key_ready:1; + u8 level; + bool key_ready; }; #define MAX_EXTENTS 128 @@ -122,6 +123,13 @@ struct file_extent_cluster { u64 end; u64 boundary[MAX_EXTENTS]; unsigned int nr; + u64 owning_root; +}; + +/* Stages of data relocation. */ +enum reloc_stage { + MOVE_DATA_EXTENTS, + UPDATE_DATA_PTRS }; struct reloc_control { @@ -155,16 +163,12 @@ struct reloc_control { u64 search_start; u64 extents_found; - unsigned int stage:8; - unsigned int create_reloc_tree:1; - unsigned int merge_reloc_tree:1; - unsigned int found_file_extent:1; + enum reloc_stage stage; + bool create_reloc_tree; + bool merge_reloc_tree; + bool found_file_extent; }; -/* stages of data relocation */ -#define MOVE_DATA_EXTENTS 0 -#define UPDATE_DATA_PTRS 1 - static void mark_block_processed(struct reloc_control *rc, struct btrfs_backref_node *node) { @@ -180,13 +184,6 @@ static void mark_block_processed(struct reloc_control *rc, node->processed = 1; } - -static void mapping_tree_init(struct mapping_tree *tree) -{ - tree->rb_root = RB_ROOT; - spin_lock_init(&tree->lock); -} - /* * walk up backref nodes until reach node presents tree root */ @@ -413,20 +410,19 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( struct btrfs_backref_node *node = NULL; struct btrfs_backref_edge *edge; int ret; - int err = 0; iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } node = btrfs_backref_alloc_node(cache, bytenr, level); if (!node) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -437,10 +433,9 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( do { ret = btrfs_backref_add_tree_node(trans, cache, path, iter, node_key, cur); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } + edge = list_first_entry_or_null(&cache->pending_edge, struct btrfs_backref_edge, list[UPPER]); /* @@ -455,19 +450,18 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( /* Finish the upper linkage of newly added edges/nodes */ ret = btrfs_backref_finish_upper_links(cache, node); - if 
(ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (handle_useless_nodes(rc, node)) node = NULL; out: - btrfs_backref_iter_free(iter); + btrfs_free_path(iter->path); + kfree(iter); btrfs_free_path(path); - if (err) { + if (ret) { btrfs_backref_error_cleanup(cache, node); - return ERR_PTR(err); + return ERR_PTR(ret); } ASSERT(!node || !node->detached); ASSERT(list_empty(&cache->useless_node) && @@ -564,7 +558,7 @@ fail: /* * helper to add 'address of tree root -> reloc tree' mapping */ -static int __must_check __add_reloc_root(struct btrfs_root *root) +static int __add_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; @@ -690,9 +684,28 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = objectid; - if (root->root_key.objectid == objectid) { + if (btrfs_root_id(root) == objectid) { u64 commit_root_gen; + /* + * Relocation will wait for cleaner thread, and any half-dropped + * subvolume will be fully cleaned up at mount time. + * So here we shouldn't hit a subvolume with non-zero drop_progress. + * + * If this isn't the case, error out since it can make us attempt to + * drop references for extents that were already dropped before. + */ + if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) { + struct btrfs_key cpu_key; + + btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress); + btrfs_err(fs_info, + "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)", + objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset); + ret = -EUCLEAN; + goto fail; + } + /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); @@ -734,7 +747,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); - if (root->root_key.objectid == objectid) { + if (btrfs_root_id(root) == objectid) { btrfs_set_root_refs(root_item, 0); memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); @@ -757,7 +770,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, goto abort; } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); - reloc_root->last_trans = trans->transid; + btrfs_set_root_last_trans(reloc_root, trans->transid); return reloc_root; fail: kfree(root_item); @@ -804,7 +817,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, */ if (root->reloc_root) { reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; + btrfs_set_root_last_trans(reloc_root, trans->transid); return 0; } @@ -812,8 +825,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, * We are merging reloc roots, we do not need new reloc trees. Also * reloc trees never need their own reloc tree. 
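To make the new -EUCLEAN check concrete: drop_progress lives in the root item as an on-disk little-endian key, so it must be decoded with btrfs_disk_key_to_cpu() before it can be tested or printed. A minimal sketch of the state being rejected (the example key value is hypothetical):

struct btrfs_key cpu_key;

/* drop_progress is on-disk endian; decode before testing. */
btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
if (cpu_key.objectid != 0) {
	/* e.g. key (257 EXTENT_DATA 0) left by an interrupted drop -- hypothetical */
	return -EUCLEAN;
}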
*/ - if (!rc->create_reloc_tree || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) return 0; if (!trans->reloc_reserved) { @@ -821,7 +833,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, trans->block_rsv = rc->block_rsv; clear_rsv = 1; } - reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + reloc_root = create_reloc_root(trans, root, btrfs_root_id(root)); if (clear_rsv) trans->block_rsv = rsv; if (IS_ERR(reloc_root)) @@ -888,60 +900,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, } /* - * helper to find first cached inode with inode number >= objectid - * in a subvolume - */ -static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) -{ - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(entry)) - node = node->rb_left; - else if (objectid > btrfs_ino(entry)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(entry)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - return inode; - } - - objectid = btrfs_ino(entry) + 1; - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); - return NULL; -} - -/* * get new location of data */ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, @@ -957,7 +915,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, if (!path) return -ENOMEM; - bytenr -= BTRFS_I(reloc_inode)->index_cnt; + bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) @@ -1001,7 +959,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; u64 parent; u64 bytenr; u64 new_bytenr = 0; @@ -1017,7 +975,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, return 0; /* reloc trees always use full backref */ - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) parent = leaf->start; else parent = 0; @@ -1046,15 +1004,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, * if we are modifying block in fs tree, wait for read_folio * to complete and drop the extent cache */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) { if (first) { - inode = find_next_inode(root, key.objectid); + inode = btrfs_find_first_inode(root, key.objectid); first = 0; - } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { - btrfs_add_delayed_iput(BTRFS_I(inode)); - inode = find_next_inode(root, key.objectid); + } else if (inode && btrfs_ino(inode) < key.objectid) { + btrfs_add_delayed_iput(inode); + inode = btrfs_find_first_inode(root, key.objectid); } - if (inode && 
btrfs_ino(BTRFS_I(inode)) == key.objectid) { + if (inode && btrfs_ino(inode) == key.objectid) { struct extent_state *cached_state = NULL; end = key.offset + @@ -1063,16 +1021,20 @@ int replace_file_extents(struct btrfs_trans_handle *trans, fs_info->sectorsize)); WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - &cached_state); - if (!ret) + /* Take mmap lock to serialize with reflinks. */ + if (!down_read_trylock(&inode->i_mmap_lock)) continue; + ret = try_lock_extent(&inode->io_tree, key.offset, + end, &cached_state); + if (!ret) { + up_read(&inode->i_mmap_lock); + continue; + } - btrfs_drop_extent_map_range(BTRFS_I(inode), - key.offset, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, &cached_state); + btrfs_drop_extent_map_range(inode, key.offset, end, true); + unlock_extent(&inode->io_tree, key.offset, end, + &cached_state); + up_read(&inode->i_mmap_lock); } } @@ -1090,22 +1052,28 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - num_bytes, parent); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset, - root->root_key.objectid, false); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = num_bytes; + ref.parent = parent; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_header_owner(leaf); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, parent); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset, - root->root_key.objectid, false); + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = parent; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_header_owner(leaf); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1115,7 +1083,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (dirty) btrfs_mark_buffer_dirty(trans, leaf); if (inode) - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); return ret; } @@ -1161,8 +1129,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, int ret; int slot; - ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: @@ -1314,39 +1282,54 @@ again: path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(trans, path->nodes[level]); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, - blocksize, path->nodes[level]->start); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, - 0, true); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = old_bytenr; + ref.num_bytes = blocksize; + ref.parent = path->nodes[level]->start; + ref.owning_root = btrfs_root_id(src); + ref.ref_root = btrfs_root_id(src); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { 
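The replace_file_extents() hunk above introduces a trylock pair: take the inode's i_mmap_lock first to fence off reflinks, then try the extent range, and skip the extent entirely if either attempt fails rather than blocking with a transaction held. The pattern reduced to its essentials, using the same names as the hunk:

/* Never block on either lock with the transaction held; skip instead. */
if (!down_read_trylock(&inode->i_mmap_lock))
	continue;				/* reflink in flight */
if (!try_lock_extent(&inode->io_tree, key.offset, end, &cached_state)) {
	up_read(&inode->i_mmap_lock);
	continue;				/* readers hold the range */
}
btrfs_drop_extent_map_range(inode, key.offset, end, true);
unlock_extent(&inode->io_tree, key.offset, end, &cached_state);
up_read(&inode->i_mmap_lock);		/* release in reverse order */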
btrfs_abort_transaction(trans, ret); break; } - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - blocksize, 0); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, - true); + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = blocksize; + ref.parent = 0; + ref.owning_root = btrfs_root_id(dest); + ref.ref_root = btrfs_root_id(dest); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, - blocksize, path->nodes[level]->start); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, - 0, true); + /* We don't know the real owning_root, use 0. */ + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = blocksize; + ref.parent = path->nodes[level]->start; + ref.owning_root = 0; + ref.ref_root = btrfs_root_id(src); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, - blocksize, 0); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, - 0, true); + /* We don't know the real owning_root, use 0. */ + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = old_bytenr; + ref.num_bytes = blocksize; + ref.parent = 0; + ref.owning_root = 0; + ref.ref_root = btrfs_root_id(dest); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1454,7 +1437,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, const struct btrfs_key *max_key) { struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; u64 objectid; u64 start, end; u64 ino; @@ -1464,23 +1447,24 @@ static int invalidate_extent_cache(struct btrfs_root *root, struct extent_state *cached_state = NULL; cond_resched(); - iput(inode); + if (inode) + iput(&inode->vfs_inode); if (objectid > max_key->objectid) break; - inode = find_next_inode(root, objectid); + inode = btrfs_find_first_inode(root, objectid); if (!inode) break; - ino = btrfs_ino(BTRFS_I(inode)); + ino = btrfs_ino(inode); if (ino > max_key->objectid) { - iput(inode); + iput(&inode->vfs_inode); break; } objectid = ino + 1; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->vfs_inode.i_mode)) continue; if (unlikely(min_key->objectid == ino)) { @@ -1513,9 +1497,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_drop_extent_map_range(inode, start, end, true); + unlock_extent(&inode->io_tree, start, end, &cached_state); } return 0; } @@ -1550,7 +1534,7 @@ static int insert_dirty_subvol(struct btrfs_trans_handle *trans, int ret; /* @root must be a subvolume tree root with a valid reloc tree */ - ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); ASSERT(reloc_root); reloc_root_item = &reloc_root->root_item; @@ -1579,7 +1563,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) list_for_each_entry_safe(root, next, 
&rc->dirty_subvol_roots, reloc_dirty_list) { - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) { /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; @@ -1708,7 +1692,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * btrfs_update_reloc_root() and update our root item * appropriately. */ - reloc_root->last_trans = trans->transid; + btrfs_set_root_last_trans(reloc_root, trans->transid); trans->block_rsv = rc->block_rsv; replaced = 0; @@ -1829,7 +1813,7 @@ again: } } - rc->merge_reloc_tree = 1; + rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { reloc_root = list_entry(rc->reloc_roots.next, @@ -1854,13 +1838,13 @@ again: if (root->reloc_root) { btrfs_err(fs_info, "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", - root->root_key.objectid, - root->reloc_root->root_key.objectid, + btrfs_root_id(root), + btrfs_root_id(root->reloc_root), root->reloc_root->root_key.type, root->reloc_root->root_key.offset, btrfs_root_generation( &root->reloc_root->root_item), - reloc_root->root_key.objectid, + btrfs_root_id(reloc_root), reloc_root->root_key.type, reloc_root->root_key.offset, btrfs_root_generation( @@ -1868,8 +1852,8 @@ again: } else { btrfs_err(fs_info, "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", - root->root_key.objectid, - reloc_root->root_key.objectid, + btrfs_root_id(root), + btrfs_root_id(reloc_root), reloc_root->root_key.type, reloc_root->root_key.offset, btrfs_root_generation( @@ -2051,7 +2035,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root; int ret; - if (reloc_root->last_trans == trans->transid) + if (btrfs_get_root_last_trans(reloc_root) == trans->transid) return 0; root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); @@ -2126,7 +2110,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, return ERR_PTR(-EUCLEAN); } - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); @@ -2233,7 +2217,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return root; - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) fs_root = root; if (next != node) @@ -2249,9 +2233,8 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) return fs_root; } -static noinline_for_stack -u64 calcu_metadata_size(struct reloc_control *rc, - struct btrfs_backref_node *node, int reserve) +static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, + struct btrfs_backref_node *node) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_backref_node *next = node; @@ -2260,12 +2243,12 @@ u64 calcu_metadata_size(struct reloc_control *rc, u64 num_bytes = 0; int index = 0; - BUG_ON(reserve && node->processed); + BUG_ON(node->processed); while (next) { cond_resched(); while (1) { - if (next->processed && (reserve || next != node)) + if (next->processed) break; num_bytes += fs_info->nodesize; @@ -2293,7 +2276,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, int ret; u64 tmp; - num_bytes = calcu_metadata_size(rc, 
node, 1) * 2; + num_bytes = calcu_metadata_size(rc, node) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; @@ -2356,8 +2339,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { - struct btrfs_ref ref = { 0 }; - cond_resched(); upper = edge->node[UPPER]; @@ -2445,18 +2426,23 @@ static int do_relocation(struct btrfs_trans_handle *trans, */ ASSERT(node->eb == eb); } else { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = node->eb->start, + .num_bytes = blocksize, + .parent = upper->eb->start, + .owning_root = btrfs_header_owner(upper->eb), + .ref_root = btrfs_header_owner(upper->eb), + }; + btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); btrfs_set_node_ptr_generation(upper->eb, slot, trans->transid); btrfs_mark_buffer_dirty(trans, upper->eb); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - node->eb->start, blocksize, - upper->eb->start); btrfs_init_tree_ref(&ref, node->level, - btrfs_header_owner(upper->eb), - root->root_key.objectid, false); + btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); if (!ret) ret = btrfs_drop_subtree(trans, root, eb, @@ -2565,7 +2551,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) u32 blocksize = rc->extent_root->fs_info->nodesize; if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } @@ -2592,7 +2578,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, else btrfs_node_key_to_cpu(eb, &block->key, 0); free_extent_buffer(eb); - block->key_ready = 1; + block->key_ready = true; return 0; } @@ -2708,12 +2694,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct tree_block *block; struct tree_block *next; - int ret; - int err = 0; + int ret = 0; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out_free_blocks; } @@ -2728,8 +2713,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Get first keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) { - err = get_tree_block_key(fs_info, block); - if (err) + ret = get_tree_block_key(fs_info, block); + if (ret) goto out_free_path; } } @@ -2739,35 +2724,33 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { - err = PTR_ERR(node); + ret = PTR_ERR(node); goto out; } ret = relocate_tree_block(trans, rc, node, &block->key, path); - if (ret < 0) { - err = ret; + if (ret < 0) break; - } } out: - err = finish_pending_nodes(trans, rc, path, err); + ret = finish_pending_nodes(trans, rc, path, ret); out_free_path: btrfs_free_path(path); out_free_blocks: free_block_list(blocks); - return err; + return ret; } -static noinline_for_stack int prealloc_file_extent_cluster( - struct btrfs_inode *inode, - const struct file_extent_cluster *cluster) +static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) { + const struct file_extent_cluster *cluster = &rc->cluster; + struct btrfs_inode *inode = BTRFS_I(rc->data_inode); u64 alloc_hint = 0; u64 start; u64 end; - u64 offset = inode->index_cnt; + u64 offset = inode->reloc_block_group_start; u64 num_bytes; int nr; int ret = 0; @@ -2782,7 +2765,7 @@ 
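A worked example of the reservation sizing after the calcu_metadata_size() cleanup above: the helper counts one nodesize per not-yet-processed backref node reachable through upper edges, and reserve_metadata_space() doubles the result for the COW copies. The numbers below (16 KiB nodesize, three unprocessed nodes) are hypothetical:

#include <stdio.h>

int main(void)
{
	const unsigned long nodesize = 16 * 1024;	/* hypothetical fs nodesize */
	const unsigned long unprocessed_nodes = 3;	/* walked via upper edges */

	/* calcu_metadata_size(): one nodesize per unprocessed node. */
	unsigned long metadata = unprocessed_nodes * nodesize;

	/* reserve_metadata_space(): doubled, as in calcu_metadata_size(rc, node) * 2. */
	printf("reserve %lu bytes\n", metadata * 2);	/* 98304 */
	return 0;
}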
static noinline_for_stack int prealloc_file_extent_cluster( * btrfs_do_readpage() call of previously relocated file cluster. * * If the current cluster starts in the above range, btrfs_do_readpage() - * will skip the read, and relocate_one_page() will later writeback + * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). @@ -2791,7 +2774,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; - struct page *page; + struct folio *folio; ASSERT(sectorsize < PAGE_SIZE); ASSERT(IS_ALIGNED(i_size, sectorsize)); @@ -2822,16 +2805,16 @@ static noinline_for_stack int prealloc_file_extent_cluster( clear_extent_bits(&inode->io_tree, i_size, round_up(i_size, PAGE_SIZE) - 1, EXTENT_UPTODATE); - page = find_lock_page(mapping, i_size >> PAGE_SHIFT); + folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we * will re-read the whole page anyway. */ - if (page) { - btrfs_subpage_clear_uptodate(fs_info, page, i_size, + if (!IS_ERR(folio)) { + btrfs_subpage_clear_uptodate(fs_info, folio, i_size, round_up(i_size, PAGE_SIZE) - i_size); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } @@ -2869,11 +2852,14 @@ static noinline_for_stack int prealloc_file_extent_cluster( return ret; } -static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, - u64 start, u64 end, u64 block_start) +static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) { + struct btrfs_inode *inode = BTRFS_I(rc->data_inode); struct extent_map *em; struct extent_state *cached_state = NULL; + u64 offset = inode->reloc_block_group_start; + u64 start = rc->cluster.start - offset; + u64 end = rc->cluster.end - offset; int ret = 0; em = alloc_extent_map(); @@ -2882,13 +2868,14 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->start = start; em->len = end + 1 - start; - em->block_len = em->len; - em->block_start = block_start; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); - ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + em->disk_bytenr = rc->cluster.start; + em->disk_num_bytes = em->len; + em->ram_bytes = em->len; + em->flags |= EXTENT_FLAG_PINNED; + + lock_extent(&inode->io_tree, start, end, &cached_state); + ret = btrfs_replace_extent_map_range(inode, em, false); + unlock_extent(&inode->io_tree, start, end, &cached_state); free_extent_map(em); return ret; @@ -2916,68 +2903,91 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, return cluster->boundary[cluster_nr + 1] - 1; } -static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, - const struct file_extent_cluster *cluster, - int *cluster_nr, unsigned long page_index) +static int relocate_one_folio(struct reloc_control *rc, + struct file_ra_state *ra, + int *cluster_nr, unsigned long index) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 offset = BTRFS_I(inode)->index_cnt; + const struct file_extent_cluster *cluster = &rc->cluster; + struct inode *inode = rc->data_inode; + struct 
btrfs_fs_info *fs_info = inode_to_fs_info(inode); + u64 offset = BTRFS_I(inode)->reloc_block_group_start; const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - struct page *page; - u64 page_start; - u64 page_end; + struct folio *folio; + u64 folio_start; + u64 folio_end; u64 cur; int ret; + const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); + + ASSERT(index <= last_index); +again: + folio = filemap_lock_folio(inode->i_mapping, index); + if (IS_ERR(folio)) { - ASSERT(page_index <= last_index); - page = find_lock_page(inode->i_mapping, page_index); - if (!page) { - page_cache_sync_readahead(inode->i_mapping, ra, NULL, - page_index, last_index + 1 - page_index); - page = find_or_create_page(inode->i_mapping, page_index, mask); - if (!page) - return -ENOMEM; + /* + * On relocation we're doing readahead on the relocation inode, + * but if the filesystem is backed by a RAID stripe tree we can + * get ENOENT (e.g. due to preallocated extents not being + * mapped in the RST) from the lookup. + * + * But readahead doesn't handle the error and submits invalid + * reads to the device, causing assertion failures. + */ + if (!use_rst) + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + index, last_index + 1 - index); + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) + return PTR_ERR(folio); } - if (PageReadahead(page)) + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, - page_folio(page), page_index, - last_index + 1 - page_index); + folio, last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { ret = -EIO; - goto release_page; + goto release_folio; + } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; } } /* - * We could have lost page private when we dropped the lock to read the - * page above, make sure we set_page_extent_mapped here so we have any + * We could have lost folio private when we dropped the lock to read the + * folio above, make sure we call set_folio_extent_mapped() here so we have any * of the subpage blocksize stuff we need in place. */ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) - goto release_page; + goto release_folio; - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + folio_start = folio_pos(folio); + folio_end = folio_start + PAGE_SIZE - 1; /* * Start from the cluster, as for subpage case, the cluster can start - * inside the page. + * inside the folio.
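The uptodate handling above is the standard read-and-revalidate dance: btrfs_read_folio() ends with the folio unlocked, so after relocking, both the uptodate flag and folio->mapping must be rechecked, and a mapping change (a racing truncate or reclaim) restarts the lookup. The skeleton of that dance, with the allocation path elided:

again:
	folio = filemap_lock_folio(inode->i_mapping, index);
	/* ... allocate on IS_ERR(folio), as above ... */

	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);	/* completes with folio unlocked */
		folio_lock(folio);
		if (!folio_test_uptodate(folio)) {
			ret = -EIO;		/* read failed for good */
			goto release_folio;
		}
		if (folio->mapping != inode->i_mapping) {
			folio_unlock(folio);	/* folio left the mapping */
			folio_put(folio);
			goto again;		/* restart the lookup */
		}
	}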
*/ - cur = max(page_start, cluster->boundary[*cluster_nr] - offset); - while (cur <= page_end) { + cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); + while (cur <= folio_end) { struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; - u64 clamped_start = max(page_start, extent_start); - u64 clamped_end = min(page_end, extent_end); + u64 clamped_start = max(folio_start, extent_start); + u64 clamped_end = min(folio_end, extent_end); u32 clamped_len = clamped_end + 1 - clamped_start; /* Reserve metadata for this range */ @@ -2985,7 +2995,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, clamped_len, false); if (ret) - goto release_page; + goto release_folio; /* Mark the range delalloc and dirty for later writeback */ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, @@ -3001,19 +3011,18 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); - goto release_page; + goto release_folio; } - btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); + btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); /* - * Set the boundary if it's inside the page. + * Set the boundary if it's inside the folio. * Data relocation requires the destination extents to have the * same size as the source. * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ - if (in_range(cluster->boundary[*cluster_nr] - offset, - page_start, PAGE_SIZE)) { + if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + @@ -3036,8 +3045,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, break; } } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); @@ -3045,16 +3054,17 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, ret = -ECANCELED; return ret; -release_page: - unlock_page(page); - put_page(page); +release_folio: + folio_unlock(folio); + folio_put(folio); return ret; } -static int relocate_file_extent_cluster(struct inode *inode, - const struct file_extent_cluster *cluster) +static int relocate_file_extent_cluster(struct reloc_control *rc) { - u64 offset = BTRFS_I(inode)->index_cnt; + struct inode *inode = rc->data_inode; + const struct file_extent_cluster *cluster = &rc->cluster; + u64 offset = BTRFS_I(inode)->reloc_block_group_start; unsigned long index; unsigned long last_index; struct file_ra_state *ra; @@ -3068,21 +3078,20 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!ra) return -ENOMEM; - ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); + ret = prealloc_file_extent_cluster(rc); if (ret) goto out; file_ra_state_init(ra, inode->i_mapping); - ret = setup_relocation_extent_mapping(inode, cluster->start - offset, - cluster->end - offset, cluster->start); + ret = setup_relocation_extent_mapping(rc); if (ret) goto out; last_index = (cluster->end - offset) >> PAGE_SHIFT; for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) - ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); + ret = relocate_one_folio(rc, ra, 
&cluster_nr, index); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3090,21 +3099,53 @@ out: return ret; } -static noinline_for_stack int relocate_data_extent(struct inode *inode, - const struct btrfs_key *extent_key, - struct file_extent_cluster *cluster) +static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, + const struct btrfs_key *extent_key) { + struct inode *inode = rc->data_inode; + struct file_extent_cluster *cluster = &rc->cluster; int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { - ret = relocate_file_extent_cluster(inode, cluster); + ret = relocate_file_extent_cluster(rc); if (ret) return ret; cluster->nr = 0; } - if (!cluster->nr) + /* + * Under simple quotas, we set root->relocation_src_root when we find + * the extent. If adjacent extents have different owners, we can't merge + * them while relocating. Handle this by storing the owning root that + * started a cluster and if we see an extent from a different root break + * cluster formation (just like the above case of non-adjacent extents). + * + * Without simple quotas, relocation_src_root is always 0, so we should + * never see a mismatch, and it should have no effect on relocation + * clusters. + */ + if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { + u64 tmp = root->relocation_src_root; + + /* + * root->relocation_src_root is the state that actually affects + * the preallocation we do here, so set it to the root owning + * the cluster we need to relocate. + */ + root->relocation_src_root = cluster->owning_root; + ret = relocate_file_extent_cluster(rc); + if (ret) + return ret; + cluster->nr = 0; + /* And reset it back for the current extent's owning root. */ + root->relocation_src_root = tmp; + } + + if (!cluster->nr) { cluster->start = extent_key->objectid; + cluster->owning_root = root->relocation_src_root; + } else BUG_ON(cluster->nr >= MAX_EXTENTS); cluster->end = extent_key->objectid + extent_key->offset - 1; @@ -3112,7 +3153,7 @@ static noinline_for_stack int relocate_data_extent(struct inode *inode, cluster->nr++; if (cluster->nr >= MAX_EXTENTS) { - ret = relocate_file_extent_cluster(inode, cluster); + ret = relocate_file_extent_cluster(rc); if (ret) return ret; cluster->nr = 0; @@ -3210,7 +3251,7 @@ static int add_tree_block(struct reloc_control *rc, block->key.objectid = rc->extent_root->fs_info->nodesize; block->key.offset = generation; block->level = level; - block->key_ready = 0; + block->key_ready = false; block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); @@ -3306,7 +3347,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, if (inode) goto truncate; - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget(ino, root); if (IS_ERR(inode)) return -ENOENT; @@ -3553,7 +3594,7 @@ int prepare_to_relocate(struct reloc_control *rc) if (ret) return ret; - rc->create_reloc_tree = 1; + rc->create_reloc_tree = true; set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); @@ -3631,6 +3672,21 @@ restart: struct btrfs_extent_item); flags = btrfs_extent_flags(path->nodes[0], ei); + /* + * If we are relocating a simple quota owned extent item, we + * need to note the owner on the reloc data root so that when + * we allocate the replacement item, we can attribute it to the + * correct eventual owner (rather than the reloc data root). 
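The owning_root comment above boils down to one merge rule: a cluster may only grow while the next extent is both adjacent and owned by the same root; any mismatch flushes the cluster first. A compact sketch of that test (the real hunk additionally swaps root->relocation_src_root around the owner-mismatch flush so preallocation is attributed to the old owner):

/* Flush the built-up cluster before starting a new one whenever the
 * incoming extent cannot be merged into it. */
const bool adjacent = extent_key->objectid == cluster->end + 1;
const bool same_owner = cluster->owning_root == root->relocation_src_root;

if (cluster->nr > 0 && (!adjacent || !same_owner)) {
	ret = relocate_file_extent_cluster(rc);
	if (ret)
		return ret;
	cluster->nr = 0;	/* start a fresh cluster */
}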
+ */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; + u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, + path->nodes[0], + path->slots[0]); + + root->relocation_src_root = owning_root_id; + } + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && @@ -3663,9 +3719,8 @@ restart: if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { - rc->found_file_extent = 1; - ret = relocate_data_extent(rc->data_inode, - &key, &rc->cluster); + rc->found_file_extent = true; + ret = relocate_data_extent(rc, &key); if (ret < 0) { err = ret; break; @@ -3694,13 +3749,12 @@ restart: } if (!err) { - ret = relocate_file_extent_cluster(rc->data_inode, - &rc->cluster); + ret = relocate_file_extent_cluster(rc); if (ret < 0) err = ret; } - rc->create_reloc_tree = 0; + rc->create_reloc_tree = false; set_reloc_control(rc); btrfs_backref_release_cache(&rc->backref_cache); @@ -3718,7 +3772,7 @@ restart: merge_reloc_roots(rc); - rc->merge_reloc_tree = 0; + rc->merge_reloc_tree = false; unset_reloc_control(rc); btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); @@ -3811,7 +3865,7 @@ static noinline_for_stack struct inode *create_reloc_inode( struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; - int err = 0; + int ret = 0; root = btrfs_grab_root(fs_info->data_reloc_root); trans = btrfs_start_transaction(root, 6); @@ -3820,31 +3874,31 @@ static noinline_for_stack struct inode *create_reloc_inode( return ERR_CAST(trans); } - err = btrfs_get_free_objectid(root, &objectid); - if (err) + ret = btrfs_get_free_objectid(root, &objectid); + if (ret) goto out; - err = __insert_orphan_inode(trans, root, objectid); - if (err) + ret = __insert_orphan_inode(trans, root, objectid); + if (ret) goto out; - inode = btrfs_iget(fs_info->sb, objectid, root); + inode = btrfs_iget(objectid, root); if (IS_ERR(inode)) { delete_orphan_inode(trans, root, objectid); - err = PTR_ERR(inode); + ret = PTR_ERR(inode); inode = NULL; goto out; } - BTRFS_I(inode)->index_cnt = group->start; + BTRFS_I(inode)->reloc_block_group_start = group->start; - err = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - if (err) { + if (ret) { iput(inode); - inode = ERR_PTR(err); + inode = ERR_PTR(ret); } return inode; } @@ -3900,8 +3954,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); INIT_LIST_HEAD(&rc->dirty_subvol_roots); - btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); - mapping_tree_init(&rc->reloc_root_tree); + btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); + rc->reloc_root_tree.rb_root = RB_ROOT; + spin_lock_init(&rc->reloc_root_tree.lock); extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -3921,19 +3976,17 @@ static void free_reloc_control(struct reloc_control *rc) /* * Print the block group being relocated */ -static void describe_relocation(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *block_group) +static void describe_relocation(struct btrfs_block_group *block_group) { char buf[128] = {'\0'}; btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); - btrfs_info(fs_info, - "relocating block group %llu flags %s", + btrfs_info(block_group->fs_info, "relocating 
block group %llu flags %s", block_group->start, buf); } -static const char *stage_to_string(int stage) +static const char *stage_to_string(enum reloc_stage stage) { if (stage == MOVE_DATA_EXTENTS) return "move data extents"; @@ -4037,19 +4090,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - describe_relocation(fs_info, rc->block_group); + describe_relocation(rc->block_group); btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); - btrfs_wait_ordered_roots(fs_info, U64_MAX, - rc->block_group->start, - rc->block_group->length); + btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group); ret = btrfs_zone_finish(rc->block_group); WARN_ON(ret && ret != -EAGAIN); while (1) { - int finishes_stage; + enum reloc_stage finishes_stage; mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); @@ -4068,7 +4119,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) * out of the loop if we hit an error. */ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - ret = btrfs_wait_ordered_range(rc->data_inode, 0, + ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, (u64)-1); if (ret) err = ret; @@ -4140,8 +4191,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) struct extent_buffer *leaf; struct reloc_control *rc = NULL; struct btrfs_trans_handle *trans; - int ret; - int err = 0; + int ret2; + int ret = 0; path = btrfs_alloc_path(); if (!path) @@ -4155,15 +4206,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) while (1) { ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (ret > 0) { if (path->slots[0] == 0) break; path->slots[0]--; } + ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_release_path(path); @@ -4174,7 +4224,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(reloc_root)) { - err = PTR_ERR(reloc_root); + ret = PTR_ERR(reloc_root); goto out; } @@ -4186,15 +4236,12 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) reloc_root->root_key.offset, false); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); - if (ret != -ENOENT) { - err = ret; + if (ret != -ENOENT) goto out; - } ret = mark_garbage_root(reloc_root); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } + ret = 0; } else { btrfs_put_root(fs_root); } @@ -4212,15 +4259,13 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) rc = alloc_reloc_control(fs_info); if (!rc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } ret = reloc_chunk_start(fs_info); - if (ret < 0) { - err = ret; + if (ret < 0) goto out_end; - } rc->extent_root = btrfs_extent_root(fs_info, 0); @@ -4228,11 +4273,11 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_unset; } - rc->merge_reloc_tree = 1; + rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, @@ -4248,15 +4293,15 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); + ret = PTR_ERR(fs_root); list_add_tail(&reloc_root->root_list, &reloc_roots); btrfs_end_transaction(trans); goto out_unset; } 
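The search loop above uses the standard btrfs idiom for walking items backwards from the largest possible key: search with offset (u64)-1, and since that exact key cannot exist, a positive return plus path->slots[0]-- lands on the last real item at or below it. The idiom in isolation (key setup shown as used for reloc roots earlier in this function):

key.objectid = BTRFS_TREE_RELOC_OBJECTID;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;	/* larger than any real reloc root offset */

ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
	goto out;	/* hard error */
if (ret > 0) {
	/* Exact key not found: the slot points just past it. */
	if (path->slots[0] == 0)
		break;	/* no item at or below the key */
	path->slots[0]--;
}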
- err = __add_reloc_root(reloc_root); - ASSERT(err != -EEXIST); - if (err) { + ret = __add_reloc_root(reloc_root); + ASSERT(ret != -EEXIST); + if (ret) { list_add_tail(&reloc_root->root_list, &reloc_roots); btrfs_put_root(fs_root); btrfs_end_transaction(trans); @@ -4266,8 +4311,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_root); } - err = btrfs_commit_transaction(trans); - if (err) + ret = btrfs_commit_transaction(trans); + if (ret) goto out_unset; merge_reloc_roots(rc); @@ -4276,14 +4321,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_clean; } - err = btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); out_clean: - ret = clean_dirty_subvols(rc); - if (ret < 0 && !err) - err = ret; + ret2 = clean_dirty_subvols(rc); + if (ret2 < 0 && !ret) + ret = ret2; out_unset: unset_reloc_control(rc); out_end: @@ -4294,14 +4339,14 @@ out: btrfs_free_path(path); - if (err == 0) { + if (ret == 0) { /* cleanup orphan inode in data relocation tree */ fs_root = btrfs_grab_root(fs_info->data_reloc_root); ASSERT(fs_root); - err = btrfs_orphan_cleanup(fs_root); + ret = btrfs_orphan_cleanup(fs_root); btrfs_put_root(fs_root); } - return err; + return ret; } /* @@ -4312,18 +4357,20 @@ out: */ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 disk_bytenr = ordered->file_offset + inode->index_cnt; + u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start; struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr); LIST_HEAD(list); int ret; ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, disk_bytenr + ordered->num_bytes - 1, - &list, 0, false); - if (ret) + &list, false); + if (ret < 0) { + btrfs_mark_ordered_extent_error(ordered); return ret; + } while (!list_empty(&list)) { struct btrfs_ordered_sum *sums = @@ -4373,13 +4420,22 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, btrfs_root_last_snapshot(&root->root_item)) first_cow = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && - rc->create_reloc_tree) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) { WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); + + /* + * If node->bytenr != buf->start and node->new_bytenr != + * buf->start then we've got the wrong backref node for what we + * expected to see here and the cache is incorrect. 
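The rename in btrfs_reloc_clone_csums() above (index_cnt becoming reloc_block_group_start) makes the csum lookup mapping explicit: the data relocation inode's file offsets mirror the block group's logical range, so the lookup address is simply file_offset plus the block group start. A standalone worked example with hypothetical values:

#include <stdio.h>

int main(void)
{
	/* Hypothetical values, for illustration only. */
	const unsigned long long block_group_start = 1ULL << 30; /* logical 1 GiB */
	const unsigned long long file_offset = 128ULL << 10;	 /* 128 KiB into the inode */

	/* Mirrors: disk_bytenr = ordered->file_offset + inode->reloc_block_group_start */
	unsigned long long disk_bytenr = file_offset + block_group_start;

	printf("look up csums at logical %llu\n", disk_bytenr);	/* 1073872896 */
	return 0;
}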
+ */ + if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { + btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", + buf->start, node->bytenr, node->new_bytenr); + return -EUCLEAN; + } btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs); @@ -4467,8 +4523,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, } new_root = pending->snap; - reloc_root = create_reloc_root(trans, root->reloc_root, - new_root->root_key.objectid); + reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root)); if (IS_ERR(reloc_root)) return PTR_ERR(reloc_root); diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 5fb60f2deb53..788c86d8633a 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -3,6 +3,15 @@ #ifndef BTRFS_RELOCATION_H #define BTRFS_RELOCATION_H +#include <linux/types.h> + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ordered_extent; +struct btrfs_pending_snapshot; + int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index aac18f620de4..33962671a96c 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -10,7 +10,6 @@ #include "messages.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "qgroup.h" #include "space-info.h" #include "accessors.h" @@ -51,7 +50,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, } /* - * btrfs_find_root - lookup the root by the key. + * Lookup the root by the key. + * * root: the root of the root tree * search_key: the key to search * path: the path we search @@ -81,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, if (ret > 0) goto out; } else { - BUG_ON(ret == 0); /* Logical error */ + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of the valid range. + */ + if (ret == 0) { + ret = -EUCLEAN; + goto out; + } if (path->slots[0] == 0) goto out; path->slots[0]--; @@ -141,8 +148,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret > 0) { btrfs_crit(fs_info, "unable to find root key (%llu %u %llu) in tree %llu", - key->objectid, key->type, key->offset, - root->root_key.objectid); + key->objectid, key->type, key->offset, btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); goto out; @@ -322,8 +328,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; - - BUG_ON(ret != 0); + if (ret != 0) { + /* The root must exist but we did not find it by the key. */ + ret = -EUCLEAN; + goto out; + } ret = btrfs_del_item(trans, root, path); out: @@ -485,7 +494,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, } /* - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * Reserve space for subvolume operation. 
+ * * root: the root of the parent directory * rsv: block reservation * items: the number of items that we need do reservation @@ -508,7 +518,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + if (btrfs_qgroup_enabled(fs_info)) { /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index cce808b44cc0..8f5739e732b9 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,6 +3,18 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +#include <linux/types.h> + +struct fscrypt_str; +struct extent_buffer; +struct btrfs_key; +struct btrfs_root; +struct btrfs_root_item; +struct btrfs_path; +struct btrfs_fs_info; +struct btrfs_block_rsv; +struct btrfs_trans_handle; + int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -16,10 +28,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item); int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a2d91d9f8a10..3fcc7c092c5e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -16,7 +16,6 @@ #include "backref.h" #include "extent_io.h" #include "dev-replace.h" -#include "check-integrity.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" @@ -24,6 +23,7 @@ #include "accessors.h" #include "file-item.h" #include "scrub.h" +#include "raid-stripe-tree.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -43,7 +43,7 @@ struct scrub_ctx; /* * The following value only influences the performance. * - * This detemines how many stripes would be submitted in one go, + * This determines how many stripes would be submitted in one go, * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ #define SCRUB_STRIPES_PER_GROUP 8 @@ -153,12 +153,14 @@ struct scrub_stripe { unsigned int init_nr_io_errors; unsigned int init_nr_csum_errors; unsigned int init_nr_meta_errors; + unsigned int init_nr_meta_gen_errors; /* * The following error bitmaps are all for the current status. * Every time we submit a new read, these bitmaps may be updated. * - * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; + * error_bitmap = io_error_bitmap | csum_error_bitmap | + * meta_error_bitmap | meta_generation_bitmap; * * IO and csum errors can happen for both metadata and data. */ @@ -166,6 +168,7 @@ struct scrub_stripe { unsigned long io_error_bitmap; unsigned long csum_error_bitmap; unsigned long meta_error_bitmap; + unsigned long meta_gen_error_bitmap; /* For writeback (repair or replace) error reporting. 
*/ unsigned long write_error_bitmap; @@ -192,7 +195,6 @@ struct scrub_ctx { int cur_stripe; atomic_t cancel_req; int readonly; - int sectors_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -262,7 +264,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); + ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false); if (ret < 0) goto error; @@ -617,7 +619,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr memcpy(on_disk_csum, header->csum, fs_info->csum_size); if (logical != btrfs_stack_header_bytenr(header)) { - bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad bytenr, has %llu want %llu", @@ -673,7 +675,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (stripe->sectors[sector_nr].generation != btrfs_stack_header_generation(header)) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad generation, has %llu want %llu", @@ -685,6 +687,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_clear(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree); } static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) @@ -710,7 +713,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) /* Metadata, verify the full tree block. */ if (sector->is_metadata) { /* - * Check if the tree block crosses the stripe boudary. If + * Check if the tree block crosses the stripe boundary. If * crossed the boundary, we cannot verify it but only give a * warning. * @@ -839,7 +842,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); bbio = NULL; @@ -858,7 +861,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); } @@ -884,7 +887,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* * Init needed infos for error reporting. * - * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio() + * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() * thus no need for dev/physical, error reporting still needs dev and physical. 
*/ if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { @@ -897,7 +900,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, ASSERT(stripe->mirror_num >= 1); ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, stripe->logical, &mapped_len, &bioc, - NULL, NULL, 1); + NULL, NULL); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. @@ -973,8 +976,22 @@ skip: if (__ratelimit(&rs) && dev) scrub_print_common_warning("header error", dev, false, stripe->logical, physical); + if (test_bit(sector_nr, &stripe->meta_gen_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("generation error", dev, false, + stripe->logical, physical); } + /* Update the device stats. */ + for (int i = 0; i < stripe->init_nr_io_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); + for (int i = 0; i < stripe->init_nr_csum_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + /* Generation mismatch error is based on each metadata, not each block. */ + for (int i = 0; i < stripe->init_nr_meta_gen_errors; + i += (fs_info->nodesize >> fs_info->sectorsize_bits)) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); + spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; @@ -983,7 +1000,8 @@ skip: sctx->stat.no_csum += nr_nodatacsum_sectors; sctx->stat.read_errors += stripe->init_nr_io_errors; sctx->stat.csum_errors += stripe->init_nr_csum_errors; - sctx->stat.verify_errors += stripe->init_nr_meta_errors; + sctx->stat.verify_errors += stripe->init_nr_meta_errors + + stripe->init_nr_meta_gen_errors; sctx->stat.uncorrectable_errors += bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; @@ -1029,6 +1047,8 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) stripe->nr_sectors); stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); + stripe->init_nr_meta_gen_errors = bitmap_weight(&stripe->meta_gen_error_bitmap, + stripe->nr_sectors); if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) goto out; @@ -1143,6 +1163,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); + for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + btrfs_dev_stat_inc_and_print(stripe->dev, + BTRFS_DEV_STAT_WRITE_ERRS); } bio_put(&bbio->bio); @@ -1290,7 +1313,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d * return 0 if it is a data stripe, 1 means parity stripe. */ static int get_raid56_logic_offset(u64 physical, int num, - struct map_lookup *map, u64 *offset, + struct btrfs_chunk_map *map, u64 *offset, u64 *stripe_start) { int i; @@ -1391,8 +1414,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + btrfs_release_path(path); + return -EUCLEAN; + } - ASSERT(ret > 0); /* * Here we intentionally pass 0 as @min_objectid, as there could be * an extent item starting before @search_start. 
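The scrub changes above split generation mismatches out of meta_error_bitmap into the new meta_gen_error_bitmap and feed the per-type counts into the device statistics, bumping the generation counter once per tree block rather than once per sector. A user-space model of that accounting; the 4K sector and 16K node sizes are assumed for illustration:

#include <stdio.h>

int main(void)
{
	const unsigned int sectorsize_bits = 12;                 /* 4K sectors, assumed */
	const unsigned int nodesize = 16 * 1024;                 /* 16K nodes, assumed  */
	const unsigned int sectors_per_node = nodesize >> sectorsize_bits; /* 4 */

	/* Per-type bitmaps, one bit per sector in the stripe. */
	unsigned long io = 0x01, csum = 0x02, meta = 0x00, meta_gen = 0xf0;

	/* The overall error bitmap is the union of all per-type bitmaps. */
	unsigned long error = io | csum | meta | meta_gen;

	/*
	 * A generation mismatch marks every sector of the affected tree
	 * block, so the device stat advances with a stride of
	 * sectors_per_node over the per-sector error count.
	 */
	unsigned int gen_errors = __builtin_popcountl(meta_gen);
	unsigned int gen_stat_incs = 0;
	for (unsigned int i = 0; i < gen_errors; i += sectors_per_node)
		gen_stat_incs++;

	printf("error bitmap 0x%lx, generation stat increments %u\n",
	       error, gen_stat_incs); /* 0xf3, 1 */
	return 0;
}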
@@ -1419,14 +1449,11 @@ search_forward: if (ret > 0) break; next: - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(extent_root, path); - if (ret) { - /* Either no more item or fatal error */ - btrfs_release_path(path); - return ret; - } + ret = btrfs_next_item(extent_root, path); + if (ret) { + /* Either no more items or a fatal error. */ + btrfs_release_path(path); + return ret; } } btrfs_release_path(path); @@ -1505,10 +1532,12 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) stripe->init_nr_io_errors = 0; stripe->init_nr_csum_errors = 0; stripe->init_nr_meta_errors = 0; + stripe->init_nr_meta_gen_errors = 0; stripe->error_bitmap = 0; stripe->io_error_bitmap = 0; stripe->csum_error_bitmap = 0; stripe->meta_error_bitmap = 0; + stripe->meta_gen_error_bitmap = 0; } /* @@ -1538,6 +1567,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root || !csum_root)) { + btrfs_err(fs_info, "no valid extent or csum root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); @@ -1645,20 +1678,105 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) } } +static u32 stripe_length(const struct scrub_stripe *stripe) +{ + ASSERT(stripe->bg); + + return min(BTRFS_STRIPE_LEN, + stripe->bg->start + stripe->bg->length - stripe->logical); +} + +static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; + u64 stripe_len = BTRFS_STRIPE_LEN; + int mirror = stripe->mirror_num; + int i; + + atomic_inc(&stripe->pending_io); + + for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + struct page *page = scrub_stripe_get_page(stripe, i); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + + /* We're beyond the chunk boundary, no need to read anymore. */ + if (i >= nr_sectors) + break; + + /* The current sector cannot be merged, submit the bio. */ + if (bbio && + ((i > 0 && + !test_bit(i - 1, &stripe->extent_sector_bitmap)) || + bbio->bio.bi_iter.bi_size >= stripe_len)) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bbio(bbio, mirror); + bbio = NULL; + } + + if (!bbio) { + struct btrfs_io_stripe io_stripe = {}; + struct btrfs_io_context *bioc = NULL; + const u64 logical = stripe->logical + + (i << fs_info->sectorsize_bits); + int err; + + io_stripe.rst_search_commit_root = true; + stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; + /* + * For RST cases, we need to manually split the bbio to + * follow the RST boundary. 
+ */ + err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &stripe_len, &bioc, &io_stripe, &mirror); + btrfs_put_bioc(bioc); + if (err < 0) { + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + continue; + } + + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + } + + __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + } + + if (bbio) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bbio(bbio, mirror); + } + + if (atomic_dec_and_test(&stripe->pending_io)) { + wake_up(&stripe->io_wait); + INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); + queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); + } +} + static void scrub_submit_initial_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; - unsigned int nr_sectors = min_t(u64, BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); ASSERT(stripe->mirror_num > 0); ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { + scrub_submit_extent_sector_read(sctx, stripe); + return; + } + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); @@ -1688,7 +1806,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, mirror = calc_next_mirror(mirror, num_copies); } - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } static bool stripe_has_metadata_error(struct scrub_stripe *stripe) @@ -1761,7 +1879,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) if (sctx->is_dev_replace) { /* * For dev-replace, if we know there is something wrong with - * metadata, we should immedately abort. + * metadata, we should immediately abort. 
*/ for (int i = 0; i < nr_stripes; i++) { if (stripe_has_metadata_error(&sctx->stripes[i])) { @@ -1787,6 +1905,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) stripe = &sctx->stripes[i]; wait_scrub_stripe_io(stripe); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = stripe->physical + stripe_length(stripe); + spin_unlock(&sctx->stat_lock); scrub_reset_stripe(stripe); } out: @@ -1843,7 +1964,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, u64 full_stripe_start) { DECLARE_COMPLETION_ONSTACK(io_done); @@ -1969,7 +2090,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc, NULL, NULL, 1); + &length, &bioc, NULL, NULL); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -2012,7 +2133,7 @@ out: */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2055,7 +2176,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ + spin_lock(&sctx->stat_lock); sctx->stat.last_physical = physical + logical_length; + spin_unlock(&sctx->stat_lock); ret = 0; break; } @@ -2073,7 +2196,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } /* Calculate the full stripe length for simple stripe based profiles */ -static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); @@ -2082,7 +2205,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) } /* Get the logical bytenr for the stripe */ -static u64 simple_stripe_get_logical(struct map_lookup *map, +static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, struct btrfs_block_group *bg, int stripe_index) { @@ -2099,7 +2222,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, } /* Get the mirror number for the stripe */ -static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); @@ -2111,7 +2234,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) static int scrub_simple_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct btrfs_device *device, int stripe_index) { @@ -2144,18 +2267,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct extent_map *em, + struct btrfs_chunk_map *map, struct btrfs_device *scrub_dev, int stripe_index) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; int ret2; u64 physical = map->stripes[stripe_index].physical; - const 
u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 dev_stripe_len = btrfs_calc_stripe_length(map); const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; @@ -2253,6 +2375,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, stripe_logical += chunk_logical; ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, map, stripe_logical); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN, + physical_end); + spin_unlock(&sctx->stat_lock); if (ret) goto out; goto next; @@ -2318,17 +2444,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, u64 dev_extent_len) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; + struct btrfs_chunk_map *map; int i; int ret = 0; - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, bg->start, bg->length); - read_unlock(&map_tree->lock); - - if (!em) { + map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); + if (!map) { /* * Might have been an unused block group deleted by the cleaner * kthread or relocation. @@ -2340,22 +2461,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - if (em->start != bg->start) + if (map->start != bg->start) goto out; - if (em->len < dev_extent_len) + if (map->chunk_len < dev_extent_len) goto out; - map = em->map_lookup; for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, em, scrub_dev, i); + ret = scrub_stripe(sctx, bg, map, scrub_dev, i); if (ret) goto out; } } out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -2364,19 +2484,15 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root, struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_trans_handle *trans; if (!btrfs_is_zoned(fs_info)) return 0; btrfs_wait_block_group_reservations(cache); btrfs_wait_nocow_writers(cache); - btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - return btrfs_commit_transaction(trans); + return btrfs_commit_current_transaction(root); } static noinline_for_stack @@ -2607,8 +2723,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ if (sctx->is_dev_replace) { btrfs_wait_nocow_writers(cache); - btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, - cache->length); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); } scrub_pause_off(fs_info); @@ -2736,7 +2851,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (scrub_dev->fs_devices != fs_info->fs_devices) gen = scrub_dev->generation; else - gen = fs_info->last_trans_committed; + gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index 7639103ebf9d..f0df597b75c7 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -3,6 +3,12 @@ #ifndef BTRFS_SCRUB_H #define BTRFS_SCRUB_H +#include <linux/types.h> + +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_scrub_progress; + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int 
readonly, int is_dev_replace); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index aa1e6d88a72c..c843b4aefb8a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,7 +25,6 @@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" -#include "xattr.h" #include "print-tree.h" #include "accessors.h" #include "dir-item.h" @@ -63,7 +62,7 @@ struct fs_path { /* * Average path length does not exceed 200 bytes, we'll have * better packing in the slab and higher chance to satisfy - * a allocation later during send. + * an allocation later during send. */ char pad[256]; }; @@ -347,8 +346,10 @@ struct name_cache_entry { u64 parent_gen; int ret; int need_later_update; + /* Name length without NUL terminator. */ int name_len; - char name[]; + /* Not NUL terminated. */ + char name[] __counted_by(name_len) __nonstring; }; /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ @@ -393,9 +394,8 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, btrfs_err(sctx->send_root->fs_info, "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", result_string, what, sctx->cmp_key->objectid, - sctx->send_root->root_key.objectid, - (sctx->parent_root ? - sctx->parent_root->root_key.objectid : 0)); + btrfs_root_id(sctx->send_root), + (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0)); } __maybe_unused @@ -487,10 +487,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len > PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; path_len = p->end - p->start; old_buf_len = p->buf_len; @@ -801,7 +799,7 @@ static int send_cmd(struct send_ctx *sctx) put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); put_unaligned_le32(0, &hdr->crc); - crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -1138,7 +1136,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Start with a small buffer (1 page). If later we end up needing more * space, which can happen for xattrs on a fs with a leaf size greater - * then the page size, attempt to increase the buffer. Typically xattr + * than the page size, attempt to increase the buffer. Typically xattr * values are small. 
*/ buf_len = PATH_MAX; @@ -1317,9 +1315,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) u64 root = (u64)(uintptr_t)key; const struct clone_root *cr = elt; - if (root < cr->root->root_key.objectid) + if (root < btrfs_root_id(cr->root)) return -1; - if (root > cr->root->root_key.objectid) + if (root > btrfs_root_id(cr->root)) return 1; return 0; } @@ -1329,9 +1327,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) const struct clone_root *cr1 = e1; const struct clone_root *cr2 = e2; - if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) + if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root)) return -1; - if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) + if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root)) return 1; return 0; } @@ -1419,7 +1417,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) + if (sctx->backref_cache.size == 0) return false; /* @@ -1517,7 +1515,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + if (sctx->backref_cache.size == 1) sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } @@ -1779,7 +1777,7 @@ static int read_symlink(struct btrfs_root *root, */ btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu", - ino, root->root_key.objectid); + ino, btrfs_root_id(root)); ret = -EIO; goto out; } @@ -2390,7 +2388,7 @@ out_cache: /* * Store the result of the lookup in the name cache. */ - nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL); + nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); if (!nce) { ret = -ENOMEM; goto out; @@ -2402,7 +2400,7 @@ out_cache: nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); nce->ret = ret; - strcpy(nce->name, dest->start); + memcpy(nce->name, dest->start, nce->name_len); if (ino < sctx->send_progress) nce->need_later_update = 0; @@ -2533,7 +2531,7 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; } - key.objectid = send_root->root_key.objectid; + key.objectid = btrfs_root_id(send_root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; @@ -2549,7 +2547,7 @@ static int send_subvol_begin(struct send_ctx *sctx) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || - key.objectid != send_root->root_key.objectid) { + key.objectid != btrfs_root_id(send_root)) { ret = -ENOENT; goto out; } @@ -2822,8 +2820,7 @@ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) static int trim_dir_utimes_cache(struct send_ctx *sctx) { - while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > - SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) { struct btrfs_lru_cache_entry *lru; int ret; @@ -5191,11 +5188,10 @@ out: static int process_verity(struct send_ctx *sctx) { int ret = 0; - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct inode *inode; struct fs_path *p; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root); + inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5276,10 +5272,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct 
btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct page *page; + struct folio *folio; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); + struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; ret = put_data_header(sctx, len); @@ -5292,44 +5289,50 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(sctx->cur_inode->i_mapping, index); - if (!page) { - page_cache_sync_readahead(sctx->cur_inode->i_mapping, +again: + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + page_cache_sync_readahead(mapping, &sctx->ra, NULL, index, last_index + 1 - index); - page = find_or_create_page(sctx->cur_inode->i_mapping, - index, GFP_KERNEL); - if (!page) { - ret = -ENOMEM; + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; } } - if (PageReadahead(page)) - page_cache_async_readahead(sctx->cur_inode->i_mapping, - &sctx->ra, NULL, page_folio(page), - index, last_index + 1 - index); + WARN_ON(folio_order(folio)); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); + if (folio_test_readahead(folio)) + page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, + last_index + 1 - index); + + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", - page_offset(page), sctx->cur_ino, - sctx->send_root->root_key.objectid); - put_page(page); + folio_pos(folio), sctx->cur_ino, + btrfs_root_id(sctx->send_root)); + folio_put(folio); ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } - memcpy_from_page(sctx->send_buf + sctx->send_size, page, - pg_offset, cur_len); - unlock_page(page); - put_page(page); + memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, + pg_offset, cur_len); + folio_unlock(folio); + folio_put(folio); index++; pg_offset = 0; len -= cur_len; @@ -5390,7 +5393,7 @@ static int send_clone(struct send_ctx *sctx, btrfs_debug(sctx->send_root->fs_info, "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, clone_root->root->root_key.objectid, + offset, len, btrfs_root_id(clone_root->root), clone_root->ino, clone_root->offset); p = fs_path_alloc(); @@ -5552,7 +5555,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, size_t inline_size; int ret; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + inode = btrfs_iget(sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5619,7 +5622,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, u32 crc; int ret; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + inode = btrfs_iget(sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5678,7 +5681,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. 
*/ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT)); @@ -5688,8 +5691,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, hdr = (struct btrfs_cmd_header *)sctx->send_buf; hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); hdr->crc = 0; - crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); - crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + crc = crc32c(0, sctx->send_buf, sctx->send_size); + crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -5748,7 +5751,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, if (sctx->cur_inode == NULL) { struct btrfs_root *root = sctx->send_root; - sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); + sctx->cur_inode = btrfs_iget(sctx->cur_ino, root); if (IS_ERR(sctx->cur_inode)) { int err = PTR_ERR(sctx->cur_inode); @@ -6524,21 +6527,18 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) return 0; - if (sctx->cur_inode_last_extent == (u64)-1) { - ret = get_last_extent(sctx, key->offset - 1); - if (ret) - return ret; - } - - if (path->slots[0] == 0 && - sctx->cur_inode_last_extent < key->offset) { - /* - * We might have skipped entire leafs that contained only - * file extent items for our current inode. These leafs have - * a generation number smaller (older) than the one in the - * current leaf and the leaf our last extent came from, and - * are located between these 2 leafs. - */ + /* + * Get last extent's end offset (exclusive) if we haven't determined it + * yet (we're processing the first file extent item that is new), or if + * we're at the first slot of a leaf and the last extent's end is less + * than the current extent's offset, because we might have skipped + * entire leaves that contained only file extent items for our current + * inode. These leaves have a generation number smaller (older) than the + * one in the current leaf and the leaf our last extent came from, and + * are located between these 2 leaves. + */ + if ((sctx->cur_inode_last_extent == (u64)-1) || + (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) { ret = get_last_extent(sctx, key->offset - 1); if (ret) return ret; @@ -7194,13 +7194,11 @@ static int changed_extent(struct send_ctx *sctx, static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { - int ret = 0; - if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) sctx->cur_inode_needs_verity = true; } - return ret; + return 0; } static int dir_changed(struct send_ctx *sctx, u64 dir) @@ -7389,7 +7387,7 @@ static int search_key_again(const struct send_ctx *sctx, "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", key->objectid, key->type, key->offset, (root == sctx->parent_root ? 
"parent" : "send"), - root->root_key.objectid, path->lowest_level, + btrfs_root_id(root), path->lowest_level, path->slots[path->lowest_level]); return -EUCLEAN; } @@ -8050,34 +8048,18 @@ out: */ static int ensure_commit_roots_uptodate(struct send_ctx *sctx) { - int i; - struct btrfs_trans_handle *trans = NULL; - -again: - if (sctx->parent_root && - sctx->parent_root->node != sctx->parent_root->commit_root) - goto commit_trans; - - for (i = 0; i < sctx->clone_roots_cnt; i++) - if (sctx->clone_roots[i].root->node != - sctx->clone_roots[i].root->commit_root) - goto commit_trans; + struct btrfs_root *root = sctx->parent_root; - if (trans) - return btrfs_end_transaction(trans); + if (root && root->node != root->commit_root) + return btrfs_commit_current_transaction(root); - return 0; - -commit_trans: - /* Use any root, all fs roots will get their commit roots updated. */ - if (!trans) { - trans = btrfs_join_transaction(sctx->send_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - goto again; + for (int i = 0; i < sctx->clone_roots_cnt; i++) { + root = sctx->clone_roots[i].root; + if (root->node != root->commit_root) + return btrfs_commit_current_transaction(root); } - return btrfs_commit_transaction(trans); + return 0; } /* @@ -8098,7 +8080,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; - btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + btrfs_wait_ordered_extents(root, U64_MAX, NULL); } for (i = 0; i < sctx->clone_roots_cnt; i++) { @@ -8106,7 +8088,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; - btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + btrfs_wait_ordered_extents(root, U64_MAX, NULL); } return 0; @@ -8123,7 +8105,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) if (root->send_in_progress < 0) btrfs_err(root->fs_info, "send_in_progress unbalanced %d root %llu", - root->send_in_progress, root->root_key.objectid); + root->send_in_progress, btrfs_root_id(root)); spin_unlock(&root->root_item_lock); } @@ -8131,13 +8113,13 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) { btrfs_warn_rl(root->fs_info, "cannot use root %llu for send while deduplications on it are in progress (%d in progress)", - root->root_key.objectid, root->dedupe_in_progress); + btrfs_root_id(root), root->dedupe_in_progress); } -long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = BTRFS_I(inode)->root; + struct btrfs_root *send_root = inode->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 4f5509cb1803..b07f4aa66878 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -8,6 +8,11 @@ #define BTRFS_SEND_H #include <linux/types.h> +#include <linux/sizes.h> +#include <linux/align.h> + +struct btrfs_inode; +struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. 
*/ @@ -25,9 +30,6 @@ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K #define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) -struct inode; -struct btrfs_ioctl_send_args; - enum btrfs_tlv_type { BTRFS_TLV_U8, BTRFS_TLV_U16, @@ -180,6 +182,6 @@ enum { __BTRFS_SEND_A_MAX = 35, }; -long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 581bdd709ee0..d5a9cd8a4fd8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include "linux/spinlock.h" +#include <linux/minmax.h> #include "misc.h" #include "ctree.h" #include "space-info.h" @@ -9,7 +11,6 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -162,7 +163,7 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -191,6 +192,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL) + /* * Calculate chunk size depending on volume type (regular or zoned). */ @@ -233,6 +236,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (!space_info) return -ENOMEM; + space_info->fs_info = info; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); @@ -341,12 +345,35 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, return NULL; } +static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *data_sinfo; + u64 data_chunk_size; + + /* + * Calculate the data_chunk_size, space_info->chunk_size is the + * "optimal" chunk size based on the fs size. However when we actually + * allocate the chunk we will strip this down further, making it no + * more than 10% of the disk or 1G, whichever is smaller. + * + * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size) + * as it is. 
+ */ + data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + if (btrfs_is_zoned(fs_info)) + return data_sinfo->chunk_size; + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + return min_t(u64, data_chunk_size, SZ_1G); +} + static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, + const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; + u64 data_chunk_size; int factor; if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) @@ -364,6 +391,27 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, */ factor = btrfs_bg_type_to_factor(profile); avail = div_u64(avail, factor); + if (avail == 0) + return 0; + + data_chunk_size = calc_effective_data_chunk_size(fs_info); + + /* + * Since data allocations immediately use block groups as part of the + * reservation, because we assume that data reservations will == actual + * usage, we could potentially overcommit and then immediately have that + * available space used by a data allocation, which could put us in a + * bind when we get close to filling the file system. + * + * To handle this simply remove the data_chunk_size from the available + * space. If we are relatively empty this won't affect our ability to + * overcommit much, and if we're very close to full it'll keep us from + * getting into a position where we've given ourselves very little + * metadata wiggle room. + */ + if (avail <= data_chunk_size) + return 0; + avail -= data_chunk_size; /* * If we aren't flushing all things, let us overcommit up to @@ -374,11 +422,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + + /* + * On the zoned mode, we always allocate one zone as one chunk. + * Returning non-zone size aligned bytes here will result in + * less pressure for the async metadata reclaim process, and it + * will over-commit too much leading to ENOSPC. Align down to the + * zone size to avoid that.
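The reworked overcommit logic above first strips one effective data chunk from the free device space, then scales by how aggressively we are allowed to flush, and on zoned filesystems rounds the result down to the zone size. A standalone sketch of that arithmetic; all constants and the calc_avail() name are illustrative, not the kernel's:

#include <stdio.h>
#include <stdint.h>

#define ALIGN_DOWN(x, a) ((x) / (a) * (a))

static uint64_t calc_avail(uint64_t free_dev_space, int raid_factor,
			   uint64_t data_chunk_size, int can_flush_all,
			   uint64_t zone_size /* 0 if not zoned */)
{
	uint64_t avail = free_dev_space / raid_factor;

	if (avail <= data_chunk_size)
		return 0;
	avail -= data_chunk_size;	/* keep metadata wiggle room */
	if (can_flush_all)
		avail >>= 3;		/* flushing can reclaim: overcommit only 1/8 */
	else
		avail >>= 1;		/* cannot flush: allow up to 1/2 */
	if (zone_size)
		avail = ALIGN_DOWN(avail, zone_size);
	return avail;
}

int main(void)
{
	const uint64_t g = 1024ULL * 1024 * 1024;

	/* 100G free, RAID1 (factor 2), 1G data chunk, no flush, 256M zones. */
	printf("%llu MiB\n", (unsigned long long)
	       (calc_avail(100 * g, 2, g, 0, 256ULL * 1024 * 1024) >> 20));
	/* prints 25088 MiB */
	return 0;
}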
+ */ + if (btrfs_is_zoned(fs_info)) + avail = ALIGN_DOWN(avail, fs_info->zone_size); + return avail; } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -483,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -555,20 +614,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, return nr; } -static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); - u64 nr; - - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - -#define EXTENT_SIZE_PER_ITEM SZ_256K - /* * shrink metadata reservation for delalloc */ @@ -668,7 +713,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, skip_async: loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, items, NULL); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -748,10 +793,9 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_delayed_refs_nr(fs_info, num_bytes); + btrfs_run_delayed_refs(trans, num_bytes); else - nr = 0; - btrfs_run_delayed_refs(trans, nr); + btrfs_run_delayed_refs(trans, 0); btrfs_end_transaction(trans); break; case ALLOC_CHUNK: @@ -788,14 +832,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, * because that does not wait for a transaction to fully commit * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). */ - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - if (ret == -ENOENT) - ret = 0; - break; - } - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(root); break; default: ret = -ENOSPC; @@ -807,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -834,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + const struct btrfs_space_info *space_info) { const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; @@ -977,7 +1013,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, } /* - * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * We've exhausted our flushing, start failing tickets. + * * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * @@ -1741,7 +1778,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * Try to reserve metadata bytes from the block_rsv's space. 
* * @fs_info: the filesystem - * @block_rsv: block_rsv we're allocating for + * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * @@ -1753,21 +1790,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * space already. */ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; - ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); + ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); + space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); + btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); } return ret; } @@ -1850,3 +1885,202 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } + +static u64 calc_pct_ratio(u64 x, u64 y) +{ + int err; + + if (!y) + return 0; +again: + err = check_mul_overflow(100, x, &x); + if (err) + goto lose_precision; + return div64_u64(x, y); +lose_precision: + x >>= 10; + y >>= 10; + if (!y) + y = 1; + goto again; +} + +/* + * A reasonable buffer for unallocated space is 10 data block_groups. + * If we claw this back repeatedly, we can still achieve efficient + * utilization when near full, and not do too much reclaim while + * always maintaining a solid buffer for workloads that quickly + * allocate and pressure the unallocated space. + */ +static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) +{ + u64 chunk_sz = calc_effective_data_chunk_size(fs_info); + + return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz; +} + +/* + * The fundamental goal of automatic reclaim is to protect the filesystem's + * unallocated space and thus minimize the probability of the filesystem going + * read only when a metadata allocation failure causes a transaction abort. + * + * However, relocations happen into the space_info's unused space, therefore + * automatic reclaim must also back off as that space runs low. There is no + * value in doing trivial "relocations" of re-writing the same block group + * into a fresh one. + * + * Furthermore, we want to avoid doing too much reclaim even if there are good + * candidates. This is because the allocator is pretty good at filling up the + * holes with writes. So we want to do just enough reclaim to try and stay + * safe from running out of unallocated space but not be wasteful about it. + * + * Therefore, the dynamic reclaim threshold is calculated as follows: + * - calculate a target unallocated amount of 10 block group sized chunks + * - ratchet up the intensity of reclaim depending on how far we are from + * that target by using a formula of unalloc / target to set the threshold. + * + * Typically with 10 block groups as the target, the discrete values this comes + * out to are 0, 10, 20, ... , 80, 90, and 99.
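With the target fixed at BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10) effective data chunks, the threshold described above reduces to 100 * (target - unalloc) / target, clamped at zero. A small sketch of the resulting staircase follows; it uses the idealized math (the comment above lists 99 as the top step, whereas the bare formula reaches 100 when unallocated space hits zero), and the 1G chunk size is an assumption:

#include <stdio.h>
#include <stdint.h>

static int reclaim_threshold(uint64_t unalloc, uint64_t chunk_sz)
{
	const uint64_t target = 10 * chunk_sz;	/* BTRFS_UNALLOC_BLOCK_GROUP_TARGET */
	const uint64_t want = target > unalloc ? target - unalloc : 0;

	return (int)(100 * want / target);	/* want <= target, so this fits */
}

int main(void)
{
	const uint64_t chunk = 1ULL << 30;	/* assume a 1G effective data chunk */

	for (int bgs = 10; bgs >= 0; bgs--)
		printf("unalloc = %2d chunks -> threshold %3d%%\n",
		       bgs, reclaim_threshold((uint64_t)bgs * chunk, chunk));
	return 0;
}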
+ */ +static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 target = calc_unalloc_target(fs_info); + u64 alloc = space_info->total_bytes; + u64 used = btrfs_space_info_used(space_info, false); + u64 unused = alloc - used; + u64 want = target > unalloc ? target - unalloc : 0; + u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); + + /* If we have no unused space, don't bother, it won't work anyway. */ + if (unused < data_chunk_size) + return 0; + + /* Cast to int is OK because want <= target. */ + return calc_pct_ratio(want, target); +} + +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) +{ + lockdep_assert_held(&space_info->lock); + + if (READ_ONCE(space_info->dynamic_reclaim)) + return calc_dynamic_reclaim_threshold(space_info); + return READ_ONCE(space_info->bg_reclaim_threshold); +} + +/* + * Under "urgent" reclaim, we will reclaim even fresh block groups that have + * recently seen successful allocations, as we are desperate to reclaim + * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs. + */ +static bool is_reclaim_urgent(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); + + return unalloc < data_chunk_size; +} + +static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, int raid) +{ + struct btrfs_block_group *bg; + int thresh_pct; + bool try_again = true; + bool urgent; + + spin_lock(&space_info->lock); + urgent = is_reclaim_urgent(space_info); + thresh_pct = btrfs_calc_reclaim_threshold(space_info); + spin_unlock(&space_info->lock); + + down_read(&space_info->groups_sem); +again: + list_for_each_entry(bg, &space_info->block_groups[raid], list) { + u64 thresh; + bool reclaim = false; + + btrfs_get_block_group(bg); + spin_lock(&bg->lock); + thresh = mult_perc(bg->length, thresh_pct); + if (bg->used < thresh && bg->reclaim_mark) { + try_again = false; + reclaim = true; + } + bg->reclaim_mark++; + spin_unlock(&bg->lock); + if (reclaim) + btrfs_mark_bg_to_reclaim(bg); + btrfs_put_block_group(bg); + } + + /* + * In situations where we are very motivated to reclaim (low unalloc) + * use two passes to make the reclaim mark check best effort. + * + * If we have any staler groups, we don't touch the fresher ones, but if we + * really need a block group, do take a fresh one. 
+ */ + if (try_again && urgent) { + try_again = false; + goto again; + } + + up_read(&space_info->groups_sem); +} + +void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) +{ + u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info); + + lockdep_assert_held(&space_info->lock); + space_info->reclaimable_bytes += bytes; + + if (space_info->reclaimable_bytes >= chunk_sz) + btrfs_set_periodic_reclaim_ready(space_info, true); +} + +void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready) +{ + lockdep_assert_held(&space_info->lock); + if (!READ_ONCE(space_info->periodic_reclaim)) + return; + if (ready != space_info->periodic_reclaim_ready) { + space_info->periodic_reclaim_ready = ready; + if (!ready) + space_info->reclaimable_bytes = 0; + } +} + +bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) +{ + bool ret; + + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return false; + if (!READ_ONCE(space_info->periodic_reclaim)) + return false; + + spin_lock(&space_info->lock); + ret = space_info->periodic_reclaim_ready; + btrfs_set_periodic_reclaim_ready(space_info, false); + spin_unlock(&space_info->lock); + + return ret; +} + +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) +{ + int raid; + struct btrfs_space_info *space_info; + + list_for_each_entry(space_info, &fs_info->space_info, list) { + if (!btrfs_should_periodic_reclaim(space_info)) + continue; + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) + do_reclaim_sweep(fs_info, space_info, raid); + } +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 08a3bd10addc..efbecc0c5258 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,8 +3,18 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include <trace/events/btrfs.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/kobject.h> +#include <linux/lockdep.h> +#include <linux/wait.h> +#include <linux/rwsem.h> #include "volumes.h" +struct btrfs_fs_info; +struct btrfs_block_group; + /* * Different levels for to flush space when doing space reservations. * @@ -84,6 +94,7 @@ enum btrfs_flush_state { }; struct btrfs_space_info { + struct btrfs_fs_info *fs_info; spinlock_t lock; u64 total_bytes; /* total bytes in the space, @@ -155,6 +166,47 @@ struct btrfs_space_info { struct kobject kobj; struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; + + /* + * Monotonically increasing counter of block group reclaim attempts + * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_count + */ + u64 reclaim_count; + + /* + * Monotonically increasing counter of reclaimed bytes + * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_bytes + */ + u64 reclaim_bytes; + + /* + * Monotonically increasing counter of reclaim errors + * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_errors + */ + u64 reclaim_errors; + + /* + * If true, use the dynamic relocation threshold, instead of the + * fixed bg_reclaim_threshold. + */ + bool dynamic_reclaim; + + /* + * Periodically check all block groups against the reclaim + * threshold in the cleaner thread. + */ + bool periodic_reclaim; + + /* + * Periodic reclaim should be a no-op if a space_info hasn't + * freed any space since the last time we tried. + */ + bool periodic_reclaim_ready; + + /* + * Net bytes freed or allocated since the last reclaim pass. 
+ */ + s64 reclaimable_bytes; }; struct reserve_ticket { @@ -165,7 +217,7 @@ struct reserve_ticket { wait_queue_head_t wait; }; -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); @@ -206,20 +258,20 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( @@ -238,4 +290,10 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); +void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); +bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index b98d42ca5564..88a01d51ab11 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,7 +64,8 @@ * This means a slightly higher tree locking latency. */ -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +#if PAGE_SIZE > SZ_4K +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) return false; @@ -74,8 +75,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) * mapping. And if page->mapping->host is data inode, it's subpage. * As we have ruled our sectorsize >= PAGE_SIZE case already. 
*/ - if (!page->mapping || !page->mapping->host || - is_data_inode(page->mapping->host)) + if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host))) return true; /* @@ -86,37 +86,10 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) return true; return false; } - -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) -{ - unsigned int cur = 0; - unsigned int nr_bits; - - ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); - - nr_bits = PAGE_SIZE / sectorsize; - subpage_info->bitmap_nr_bits = nr_bits; - - subpage_info->uptodate_offset = cur; - cur += nr_bits; - - subpage_info->dirty_offset = cur; - cur += nr_bits; - - subpage_info->writeback_offset = cur; - cur += nr_bits; - - subpage_info->ordered_offset = cur; - cur += nr_bits; - - subpage_info->checked_offset = cur; - cur += nr_bits; - - subpage_info->total_nr_bits = cur; -} +#endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page, enum btrfs_subpage_type type) + struct folio *folio, enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; @@ -124,31 +97,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ - if (page->mapping) - ASSERT(PageLocked(page)); + if (folio->mapping) + ASSERT(folio_test_locked(folio)); - /* Either not subpage, or the page already has private attached */ - if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) + /* Either not subpage, or the folio already has private attached. */ + if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); if (IS_ERR(subpage)) return PTR_ERR(subpage); - attach_page_private(page, subpage); + folio_attach_private(folio, subpage); return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - /* Either not subpage, or already detached */ - if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) + /* Either not subpage, or the folio already has private attached. */ + if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) return; - subpage = detach_page_private(page); + subpage = folio_detach_private(folio); ASSERT(subpage); btrfs_free_subpage(subpage); } @@ -162,18 +134,16 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, ASSERT(fs_info->sectorsize < PAGE_SIZE); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); - if (type == BTRFS_SUBPAGE_METADATA) { + if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); - } else { - atomic_set(&ret->readers, 0); - atomic_set(&ret->writers, 0); - } + else + atomic_set(&ret->nr_locked, 0); return ret; } @@ -188,243 +158,251 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) * This is important for eb allocation, to prevent race with last eb freeing * of the same page. 
* With the eb_refs increased before the eb inserted into radix tree, - * detach_extent_buffer_page() won't detach the page private while we're still + * detach_extent_buffer_page() won't detach the folio private while we're still * allocating the extent buffer. */ -void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + ASSERT(folio_test_private(folio) && folio->mapping); + lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); atomic_inc(&subpage->eb_refs); } -void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + ASSERT(folio_test_private(folio) && folio->mapping); + lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(atomic_read(&subpage->eb_refs)); atomic_dec(&subpage->eb_refs); } static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { + /* For subpage support, the folio must be single page. */ + ASSERT(folio_order(folio) == 0); + /* Basic checks */ - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); /* * The range check only works for mapped page, we can still have * unmapped page like dummy extent buffer pages. */ - if (page->mapping) - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + if (folio->mapping) + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); } -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const int nbits = len >> fs_info->sectorsize_bits; - - btrfs_subpage_assert(fs_info, page, start, len); - - atomic_add(nbits, &subpage->readers); -} - -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const int nbits = len >> fs_info->sectorsize_bits; - bool is_data; - bool last; - - btrfs_subpage_assert(fs_info, page, start, len); - is_data = is_data_inode(page->mapping->host); - ASSERT(atomic_read(&subpage->readers) >= nbits); - last = atomic_sub_and_test(nbits, &subpage->readers); - - /* - * For data we need to unlock the page if the last read has finished. - * - * And please don't replace @last with atomic_sub_and_test() call - * inside if () condition. - * As we want the atomic_sub_and_test() to be always executed. 
- */ - if (is_data && last) - unlock_page(page); -} +#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ +({ \ + unsigned int __start_bit; \ + \ + btrfs_subpage_assert(fs_info, folio, start, len); \ + __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ + __start_bit; \ +}) -static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) +static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; u32 orig_len = *len; - *start = max_t(u64, page_offset(page), orig_start); + *start = max_t(u64, folio_pos(folio), orig_start); /* * For certain call sites like btrfs_drop_pages(), we may have pages * beyond the target range. In that case, just set @len to 0, subpage * helpers can handle @len == 0 without any problem. */ - if (page_offset(page) >= orig_start + orig_len) + if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, page_offset(page) + PAGE_SIZE, + *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, orig_start + orig_len) - *start; } -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const int nbits = (len >> fs_info->sectorsize_bits); - int ret; - - btrfs_subpage_assert(fs_info, page, start, len); - - ASSERT(atomic_read(&subpage->readers) == 0); - ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret == nbits); -} - -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); + unsigned long flags; + unsigned int cleared = 0; + int bit = start_bit; + bool last; - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its - * subpage::writers is 0. Handle them in a special way. + * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) + if (atomic_read(&subpage->nr_locked) == 0) { + spin_unlock_irqrestore(&subpage->lock, flags); return true; + } - ASSERT(atomic_read(&subpage->writers) >= nbits); - return atomic_sub_and_test(nbits, &subpage->writers); + for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { + clear_bit(bit, subpage->bitmaps); + cleared++; + } + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); + spin_unlock_irqrestore(&subpage->lock, flags); + return last; } /* - * Lock a page for delalloc page writeback. + * Handle different locked folios: + * + * - Non-subpage folio + * Just unlock it. * - * Return -EAGAIN if the page is not properly initialized. - * Return 0 with the page locked, and writer counter updated. 
+ * - folio locked but without any subpage range locked
+ *   This happens either before writepage_delalloc() or the delalloc range is
+ *   already handled by previous folio.
+ *   We can simply unlock it.
 *
- * Even with 0 returned, the page still need extra check to make sure
- * it's really the correct page, as the caller is using
- * filemap_get_folios_contig(), which can race with page invalidating.
+ * - folio locked with subpage range locked.
+ *   We go through the locked sectors inside the range, clear their bits in
+ *   the locked bitmap, decrease subpage::nr_locked accordingly, and unlock
+ *   the folio if that was the last locked range.
 */
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
-				 struct page *page, u64 start, u32 len)
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+			  struct folio *folio, u64 start, u32 len)
 {
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
-		lock_page(page);
-		return 0;
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+
+	ASSERT(folio_test_locked(folio));
+
+	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+		folio_unlock(folio);
+		return;
 	}
-	lock_page(page);
-	if (!PagePrivate(page) || !page->private) {
-		unlock_page(page);
-		return -EAGAIN;
+
+	/*
+	 * For the subpage case, there are two types of locked folios: with
+	 * or without a locked sector count (subpage::nr_locked).
+	 *
+	 * Since we own the page lock, no one else could touch subpage::locked
+	 * and we are safe to do several atomic operations without spinlock.
+	 */
+	if (atomic_read(&subpage->nr_locked) == 0) {
+		/* No subpage lock, locked by plain lock_page(). */
+		folio_unlock(folio);
+		return;
 	}
-	btrfs_subpage_clamp_range(page, &start, &len);
-	btrfs_subpage_start_writer(fs_info, page, start, len);
-	return 0;
+
+	btrfs_subpage_clamp_range(folio, &start, &len);
+	if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len))
+		folio_unlock(folio);
 }

-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
-				struct page *page, u64 start, u32 len)
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+				 struct folio *folio, unsigned long bitmap)
 {
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
-		return unlock_page(page);
-	btrfs_subpage_clamp_range(page, &start, &len);
-	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
-		unlock_page(page);
-}
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
+	unsigned long flags;
+	bool last = false;
+	int cleared = 0;
+	int bit;

-#define subpage_calc_start_bit(fs_info, page, name, start, len)	\
-({									\
-	unsigned int start_bit;						\
-									\
-	btrfs_subpage_assert(fs_info, page, start, len);		\
-	start_bit = offset_in_page(start) >> fs_info->sectorsize_bits;	\
-	start_bit += fs_info->subpage_info->name##_offset;		\
-	start_bit;							\
-})
+	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+		folio_unlock(folio);
+		return;
+	}
+
+	if (atomic_read(&subpage->nr_locked) == 0) {
+		/* No subpage lock, locked by plain lock_page().
*/ + folio_unlock(folio); + return; + } + + spin_lock_irqsave(&subpage->lock, flags); + for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + cleared++; + } + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); + spin_unlock_irqrestore(&subpage->lock, flags); + if (last) + folio_unlock(folio); +} #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) #define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) - SetPageUptodate(page); + folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - ClearPageUptodate(page); + folio_clear_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&subpage->lock, flags); - set_page_dirty(page); + folio_mark_dirty(folio); } /* @@ -438,10 +416,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, * extra handling for tree blocks. 
*/ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; bool last = false; @@ -455,101 +433,102 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, } void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { bool last; - last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len); if (last) - clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); } void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - set_page_writeback(page); + if (!folio_test_writeback(folio)) + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { - ASSERT(PageWriteback(page)); - end_page_writeback(page); + ASSERT(folio_test_writeback(folio)); + folio_end_writeback(folio); } spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - SetPageOrdered(page); + folio_set_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = 
folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) - ClearPageOrdered(page); + folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) - SetPageChecked(page); + folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - ClearPageChecked(page); + folio_clear_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -559,10 +538,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, */ #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ + struct folio *folio, u64 start, u32 len) \ { \ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \ name, start, len); \ unsigned long flags; \ bool ret; \ @@ -584,177 +563,199 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall * back to regular sectorsize branch. 
*/ -#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ - test_page_func) \ -void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \ + folio_clear_func, folio_test_func) \ +void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - set_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_set_func(folio); \ return; \ } \ - btrfs_subpage_set_##name(fs_info, page, start, len); \ + btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - clear_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_clear_func(folio); \ return; \ } \ - btrfs_subpage_clear_##name(fs_info, page, start, len); \ + btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ -bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ - return test_page_func(page); \ - return btrfs_subpage_test_##name(fs_info, page, start, len); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - set_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_set_func(folio); \ return; \ } \ - btrfs_subpage_clamp_range(page, &start, &len); \ - btrfs_subpage_set_##name(fs_info, page, start, len); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - clear_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_clear_func(folio); \ return; \ } \ - btrfs_subpage_clamp_range(page, &start, &len); \ - btrfs_subpage_clear_##name(fs_info, page, start, len); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ -bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, 
page)) \ - return test_page_func(page); \ - btrfs_subpage_clamp_range(page, &start, &len); \ - return btrfs_subpage_test_##name(fs_info, page, start, len); \ -} -IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, - PageUptodate); -IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, - PageDirty); -IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, - PageWriteback); -IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, - PageOrdered); -IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked); + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) \ + return folio_test_func(folio); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, + folio_test_uptodate); +IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io, + folio_test_dirty); +IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback, + folio_test_writeback); +IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, + folio_test_ordered); +IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, + folio_test_checked); /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. */ -void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage; + unsigned int start_bit; + unsigned int nbits; + unsigned long flags; if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - ASSERT(!PageDirty(page)); - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) { + ASSERT(!folio_test_dirty(folio)); return; + } - ASSERT(PagePrivate(page) && page->private); - ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); + start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); + nbits = len >> fs_info->sectorsize_bits; + subpage = folio_get_private(folio); + ASSERT(subpage); + spin_lock_irqsave(&subpage->lock, flags); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + spin_unlock_irqrestore(&subpage->lock, flags); } /* - * Handle different locked pages with different page sizes: - * - * - Page locked by plain lock_page() - * It should not have any subpage::writers count. - * Can be unlocked by unlock_page(). - * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages(). - * Rarer cases include the @locked_page from extent_write_locked_range(). + * This is for folio already locked by plain lock_page()/folio_lock(), which + * doesn't have any subpage awareness. * - * - Page locked by lock_delalloc_pages() - * There is only one caller, all pages except @locked_page for - * extent_write_locked_range(). - * In this case, we have to call subpage helper to handle the case. + * This populates the involved subpage ranges so that subpage helpers can + * properly unlock them. 
*/ -void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, - u64 start, u32 len) +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; + unsigned long flags; + unsigned int start_bit; + unsigned int nbits; + int ret; - ASSERT(PageLocked(page)); - /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, page)) - return unlock_page(page); - - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; - - /* - * For subpage case, there are two types of locked page. With or - * without writers number. - * - * Since we own the page lock, no one else could touch subpage::writers - * and we are safe to do several atomic operations without spinlock. - */ - if (atomic_read(&subpage->writers) == 0) - /* No writers, locked by plain lock_page() */ - return unlock_page(page); + ASSERT(folio_test_locked(folio)); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) + return; - /* Have writers, use proper subpage helper to end it */ - btrfs_page_end_writer_lock(fs_info, page, start, len); + subpage = folio_get_private(folio); + start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); + nbits = len >> fs_info->sectorsize_bits; + spin_lock_irqsave(&subpage->lock, flags); + /* Target range should not yet be locked. */ + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + bitmap_set(subpage->bitmaps, start_bit, nbits); + ret = atomic_add_return(nbits, &subpage->nr_locked); + ASSERT(ret <= fs_info->sectors_per_page); + spin_unlock_irqrestore(&subpage->lock, flags); } -#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ - const int bitmap_nr_bits = subpage_info->bitmap_nr_bits; \ + const int sectors_per_page = fs_info->sectors_per_page; \ \ - ASSERT(bitmap_nr_bits < BITS_PER_LONG); \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ - subpage_info->name##_offset, \ - bitmap_nr_bits); \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ } void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; + const u32 sectors_per_page = fs_info->sectors_per_page; unsigned long uptodate_bitmap; - unsigned long error_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; unsigned long ordered_bitmap; unsigned long checked_bitmap; + unsigned long locked_bitmap; unsigned long flags; - ASSERT(PagePrivate(page) && page->private); - ASSERT(subpage_info); - subpage = (struct btrfs_subpage *)page->private; + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + ASSERT(sectors_per_page > 1); + subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); + 
GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
+	GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
+	GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
+	GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap);
 	spin_unlock_irqrestore(&subpage->lock, flags);

-	dump_page(page, "btrfs subpage dump");
+	dump_page(folio_page(folio, 0), "btrfs subpage dump");
 	btrfs_warn(fs_info,
-"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
-		start, len, page_offset(page),
-		subpage_info->bitmap_nr_bits, &uptodate_bitmap,
-		subpage_info->bitmap_nr_bits, &error_bitmap,
-		subpage_info->bitmap_nr_bits, &dirty_bitmap,
-		subpage_info->bitmap_nr_bits, &writeback_bitmap,
-		subpage_info->bitmap_nr_bits, &ordered_bitmap,
-		subpage_info->bitmap_nr_bits, &checked_bitmap);
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+		start, len, folio_pos(folio),
+		sectors_per_page, &uptodate_bitmap,
+		sectors_per_page, &dirty_bitmap,
+		sectors_per_page, &locked_bitmap,
+		sectors_per_page, &writeback_bitmap,
+		sectors_per_page, &ordered_bitmap,
+		sectors_per_page, &checked_bitmap);
+}
+
+void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+				    struct folio *folio,
+				    unsigned long *ret_bitmap)
+{
+	struct btrfs_subpage *subpage;
+	unsigned long flags;
+
+	ASSERT(folio_test_private(folio) && folio_get_private(folio));
+	ASSERT(fs_info->sectors_per_page > 1);
+	subpage = folio_get_private(folio);
+
+	spin_lock_irqsave(&subpage->lock, flags);
+	GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap);
+	spin_unlock_irqrestore(&subpage->lock, flags);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 5cbf67ccbdeb..44fff1f4eac4 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -4,6 +4,12 @@
 #define BTRFS_SUBPAGE_H

 #include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/sizes.h>
+
+struct address_space;
+struct folio;
+struct btrfs_fs_info;

 /*
  * Extra info for subpage bitmap.
@@ -13,29 +19,23 @@
  *
  * This structure records how they are organized in the bitmap:
  *
- * /- uptodate_offset		/- dirty_offset		/- ordered_offset
+ * /- uptodate		/- dirty		/- ordered
  * |			|		|
  * v			v		v
  * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o|
- * |<- bitmap_nr_bits ->|
- * |<----------------- total_nr_bits ------------------>|
+ * |< sectors_per_page >|
+ *
+ * Unlike regular macro-like enums, here we do not use upper-case names, as
+ * these names will be utilized in various macros to define function names.
  */
-struct btrfs_subpage_info {
-	/* Number of bits for each bitmap */
-	unsigned int bitmap_nr_bits;
-
-	/* Total number of bits for the whole bitmap */
-	unsigned int total_nr_bits;
-
-	/*
-	 * *_start indicates where the bitmap starts, the length is always
-	 * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
-	 */
-	unsigned int uptodate_offset;
-	unsigned int dirty_offset;
-	unsigned int writeback_offset;
-	unsigned int ordered_offset;
-	unsigned int checked_offset;
+enum {
+	btrfs_bitmap_nr_uptodate = 0,
+	btrfs_bitmap_nr_dirty,
+	btrfs_bitmap_nr_writeback,
+	btrfs_bitmap_nr_ordered,
+	btrfs_bitmap_nr_checked,
+	btrfs_bitmap_nr_locked,
+	btrfs_bitmap_nr_max
 };

 /*
@@ -45,14 +45,6 @@ struct btrfs_subpage_info {
 struct btrfs_subpage {
 	/* Common members for both data and metadata pages */
 	spinlock_t lock;
-	/*
-	 * Both data and metadata needs to track how many readers are for the
-	 * page.
-	 * Data relies on @readers to unlock the page when last reader finished.
-	 * While metadata doesn't need page unlock, it needs to prevent
-	 * page::private get cleared before the last end_page_read().
-	 */
-	atomic_t readers;
 	union {
 		/*
 		 * Structures only used by metadata
@@ -62,8 +54,12 @@ struct btrfs_subpage {
 		 */
 		atomic_t eb_refs;

-		/* Structures only used by data */
-		atomic_t writers;
+		/*
+		 * Structures only used by data.
+		 *
+		 * How many sectors inside the page are locked.
+		 */
+		atomic_t nr_locked;
 	};
 	unsigned long bitmaps[];
 };
@@ -73,71 +69,67 @@ enum btrfs_subpage_type {
 	BTRFS_SUBPAGE_DATA,
 };

-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+#if PAGE_SIZE > SZ_4K
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
+#else
+static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
+				    struct address_space *mapping)
+{
+	return false;
+}
+#endif

-void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
-			 struct page *page, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
-			  struct page *page);
+			 struct folio *folio, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);

 /* Allocate additional data where page represents more than one sector */
 struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
 					  enum btrfs_subpage_type type);
 void btrfs_free_subpage(struct btrfs_subpage *subpage);

-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
-			    struct page *page);
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
-			    struct page *page);
-
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
-				struct page *page, u64 start, u32 len);
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
-			      struct page *page, u64 start, u32 len);
-
-void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
-				struct page *page, u64 start, u32 len);
-bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
-				       struct page *page, u64 start, u32 len);
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
-				 struct page *page, u64 start, u32 len);
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
-				struct page *page, u64 start, u32 len);
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+			  struct folio *folio, u64 start, u32 len);
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+			  struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+				 struct folio *folio, unsigned long bitmap);

 /*
  * Template for subpage related operations.
  *
- * btrfs_subpage_*() are for call sites where the page has subpage attached and
- * the range is ensured to be inside the page.
+ * btrfs_subpage_*() are for call sites where the folio has subpage attached and
+ * the range is ensured to be inside the folio's single page.
  *
- * btrfs_page_*() are for call sites where the page can either be subpage
- * specific or regular page. The function will handle both cases.
- * But the range still needs to be inside the page.
+ * btrfs_folio_*() are for call sites where the page can either be subpage + * specific or regular folios. The function will handle both cases. + * But the range still needs to be inside one single page. * - * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't + * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. */ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ + struct folio *folio, u64 start, u32 len); \ void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ + struct folio *folio, u64 start, u32 len); \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -145,14 +137,28 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); +/* + * Helper for error cleanup, where a folio will have its dirty flag cleared, + * with writeback started and finished. 
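+ *
+ * [Editorial note: cycling writeback (set, then clear) instead of merely
+ * clearing the dirty flag leaves the folio flags and, in the subpage case,
+ * the per-sector writeback bitmap in the same state a completed write
+ * would, which is what error paths want.]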
+ */ +static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info, + struct folio *locked_folio, + u64 start, u32 len) +{ + btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len); +} + bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); -void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, - struct page *page); -void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, - u64 start, u32 len); +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e33587a81409..6119a06b0569 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,19 +26,20 @@ #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/btrfs.h> +#include <linux/security.h> +#include <linux/fs_parser.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" +#include "direct-io.h" #include "props.h" #include "xattr.h" #include "bio.h" #include "export.h" #include "compression.h" -#include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" @@ -63,19 +64,7 @@ #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; - -/* - * Types for mounting the default subvolume and a subvolume explicitly - * requested by subvol=/path. That way the callchain is straightforward and we - * don't have to play tricks with the mount options and recursive calls to - * btrfs_mount. - * - * The new btrfs_root_fs_type also servers as a tag for the bdev_holder. - */ static struct file_system_type btrfs_fs_type; -static struct file_system_type btrfs_root_fs_type; - -static int btrfs_remount(struct super_block *sb, int *flags, char *data); static void btrfs_put_super(struct super_block *sb) { @@ -85,8 +74,22 @@ static void btrfs_put_super(struct super_block *sb) close_ctree(fs_info); } +/* Store the mount options related information. 
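+ *
+ * [Editorial note: an instance of this hangs off fs_context::fs_private
+ * while the new mount API parses options; btrfs_parse_param() below fills
+ * it in.]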
*/ +struct btrfs_fs_context { + char *subvol_name; + u64 subvol_objectid; + u64 max_inline; + u32 commit_interval; + u32 metadata_ratio; + u32 thread_pool_size; + unsigned long long mount_opt; + unsigned long compress_type:4; + unsigned int compress_level; + refcount_t refs; +}; + enum { - Opt_acl, Opt_noacl, + Opt_acl, Opt_clear_cache, Opt_commit_interval, Opt_compress, @@ -96,48 +99,38 @@ enum { Opt_degraded, Opt_device, Opt_fatal_errors, - Opt_flushoncommit, Opt_noflushoncommit, + Opt_flushoncommit, Opt_max_inline, - Opt_barrier, Opt_nobarrier, - Opt_datacow, Opt_nodatacow, - Opt_datasum, Opt_nodatasum, - Opt_defrag, Opt_nodefrag, - Opt_discard, Opt_nodiscard, + Opt_barrier, + Opt_datacow, + Opt_datasum, + Opt_defrag, + Opt_discard, Opt_discard_mode, - Opt_norecovery, Opt_ratio, Opt_rescan_uuid_tree, Opt_skip_balance, - Opt_space_cache, Opt_no_space_cache, + Opt_space_cache, Opt_space_cache_version, - Opt_ssd, Opt_nossd, - Opt_ssd_spread, Opt_nossd_spread, + Opt_ssd, + Opt_ssd_spread, Opt_subvol, Opt_subvol_empty, Opt_subvolid, Opt_thread_pool, - Opt_treelog, Opt_notreelog, + Opt_treelog, Opt_user_subvol_rm_allowed, + Opt_norecovery, /* Rescue options */ Opt_rescue, Opt_usebackuproot, Opt_nologreplay, - Opt_ignorebadroots, - Opt_ignoredatacsums, - Opt_rescue_all, - - /* Deprecated options */ - Opt_recovery, - Opt_inode_cache, Opt_noinode_cache, /* Debugging options */ - Opt_check_integrity, - Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, - Opt_enospc_debug, Opt_noenospc_debug, + Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG - Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, + Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY Opt_ref_verify, @@ -145,826 +138,651 @@ enum { Opt_err, }; -static const match_table_t tokens = { - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_clear_cache, "clear_cache"}, - {Opt_commit_interval, "commit=%u"}, - {Opt_compress, "compress"}, - {Opt_compress_type, "compress=%s"}, - {Opt_compress_force, "compress-force"}, - {Opt_compress_force_type, "compress-force=%s"}, - {Opt_degraded, "degraded"}, - {Opt_device, "device=%s"}, - {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_flushoncommit, "flushoncommit"}, - {Opt_noflushoncommit, "noflushoncommit"}, - {Opt_inode_cache, "inode_cache"}, - {Opt_noinode_cache, "noinode_cache"}, - {Opt_max_inline, "max_inline=%s"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_datacow, "datacow"}, - {Opt_nodatacow, "nodatacow"}, - {Opt_datasum, "datasum"}, - {Opt_nodatasum, "nodatasum"}, - {Opt_defrag, "autodefrag"}, - {Opt_nodefrag, "noautodefrag"}, - {Opt_discard, "discard"}, - {Opt_discard_mode, "discard=%s"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_norecovery, "norecovery"}, - {Opt_ratio, "metadata_ratio=%u"}, - {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, - {Opt_skip_balance, "skip_balance"}, - {Opt_space_cache, "space_cache"}, - {Opt_no_space_cache, "nospace_cache"}, - {Opt_space_cache_version, "space_cache=%s"}, - {Opt_ssd, "ssd"}, - {Opt_nossd, "nossd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd_spread, "nossd_spread"}, - {Opt_subvol, "subvol=%s"}, - {Opt_subvol_empty, "subvol="}, - {Opt_subvolid, "subvolid=%s"}, - {Opt_thread_pool, "thread_pool=%u"}, - {Opt_treelog, "treelog"}, - {Opt_notreelog, "notreelog"}, - {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, +enum { + Opt_fatal_errors_panic, + Opt_fatal_errors_bug, +}; - /* Rescue options */ - {Opt_rescue, "rescue=%s"}, +static const struct 
constant_table btrfs_parameter_fatal_errors[] = { + { "panic", Opt_fatal_errors_panic }, + { "bug", Opt_fatal_errors_bug }, + {} +}; + +enum { + Opt_discard_sync, + Opt_discard_async, +}; + +static const struct constant_table btrfs_parameter_discard[] = { + { "sync", Opt_discard_sync }, + { "async", Opt_discard_async }, + {} +}; + +enum { + Opt_space_cache_v1, + Opt_space_cache_v2, +}; + +static const struct constant_table btrfs_parameter_space_cache[] = { + { "v1", Opt_space_cache_v1 }, + { "v2", Opt_space_cache_v2 }, + {} +}; + +enum { + Opt_rescue_usebackuproot, + Opt_rescue_nologreplay, + Opt_rescue_ignorebadroots, + Opt_rescue_ignoredatacsums, + Opt_rescue_ignoremetacsums, + Opt_rescue_ignoresuperflags, + Opt_rescue_parameter_all, +}; + +static const struct constant_table btrfs_parameter_rescue[] = { + { "usebackuproot", Opt_rescue_usebackuproot }, + { "nologreplay", Opt_rescue_nologreplay }, + { "ignorebadroots", Opt_rescue_ignorebadroots }, + { "ibadroots", Opt_rescue_ignorebadroots }, + { "ignoredatacsums", Opt_rescue_ignoredatacsums }, + { "ignoremetacsums", Opt_rescue_ignoremetacsums}, + { "ignoresuperflags", Opt_rescue_ignoresuperflags}, + { "idatacsums", Opt_rescue_ignoredatacsums }, + { "imetacsums", Opt_rescue_ignoremetacsums}, + { "isuperflags", Opt_rescue_ignoresuperflags}, + { "all", Opt_rescue_parameter_all }, + {} +}; + +#ifdef CONFIG_BTRFS_DEBUG +enum { + Opt_fragment_parameter_data, + Opt_fragment_parameter_metadata, + Opt_fragment_parameter_all, +}; + +static const struct constant_table btrfs_parameter_fragment[] = { + { "data", Opt_fragment_parameter_data }, + { "metadata", Opt_fragment_parameter_metadata }, + { "all", Opt_fragment_parameter_all }, + {} +}; +#endif + +static const struct fs_parameter_spec btrfs_fs_parameters[] = { + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("autodefrag", Opt_defrag), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("clear_cache", Opt_clear_cache), + fsparam_u32("commit", Opt_commit_interval), + fsparam_flag("compress", Opt_compress), + fsparam_string("compress", Opt_compress_type), + fsparam_flag("compress-force", Opt_compress_force), + fsparam_string("compress-force", Opt_compress_force_type), + fsparam_flag_no("datacow", Opt_datacow), + fsparam_flag_no("datasum", Opt_datasum), + fsparam_flag("degraded", Opt_degraded), + fsparam_string("device", Opt_device), + fsparam_flag_no("discard", Opt_discard), + fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard), + fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors), + fsparam_flag_no("flushoncommit", Opt_flushoncommit), + fsparam_string("max_inline", Opt_max_inline), + fsparam_u32("metadata_ratio", Opt_ratio), + fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree), + fsparam_flag("skip_balance", Opt_skip_balance), + fsparam_flag_no("space_cache", Opt_space_cache), + fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache), + fsparam_flag_no("ssd", Opt_ssd), + fsparam_flag_no("ssd_spread", Opt_ssd_spread), + fsparam_string("subvol", Opt_subvol), + fsparam_flag("subvol=", Opt_subvol_empty), + fsparam_u64("subvolid", Opt_subvolid), + fsparam_u32("thread_pool", Opt_thread_pool), + fsparam_flag_no("treelog", Opt_treelog), + fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed), + + /* Rescue options. 
*/ + fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), /* Deprecated, with alias rescue=nologreplay */ - {Opt_nologreplay, "nologreplay"}, + __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ - {Opt_usebackuproot, "usebackuproot"}, + __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), + /* For compatibility only, alias for "rescue=nologreplay". */ + fsparam_flag("norecovery", Opt_norecovery), - /* Deprecated options */ - {Opt_recovery, "recovery"}, - - /* Debugging options */ - {Opt_check_integrity, "check_int"}, - {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, - {Opt_enospc_debug, "enospc_debug"}, - {Opt_noenospc_debug, "noenospc_debug"}, + /* Debugging options. */ + fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG - {Opt_fragment_data, "fragment=data"}, - {Opt_fragment_metadata, "fragment=metadata"}, - {Opt_fragment_all, "fragment=all"}, + fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY - {Opt_ref_verify, "ref_verify"}, + fsparam_flag("ref_verify", Opt_ref_verify), #endif - {Opt_err, NULL}, + {} }; -static const match_table_t rescue_tokens = { - {Opt_usebackuproot, "usebackuproot"}, - {Opt_nologreplay, "nologreplay"}, - {Opt_ignorebadroots, "ignorebadroots"}, - {Opt_ignorebadroots, "ibadroots"}, - {Opt_ignoredatacsums, "ignoredatacsums"}, - {Opt_ignoredatacsums, "idatacsums"}, - {Opt_rescue_all, "all"}, - {Opt_err, NULL}, -}; - -static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, - const char *opt_name) +/* No support for restricting writes to btrfs devices yet... */ +static inline blk_mode_t btrfs_open_mode(struct fs_context *fc) { - if (fs_info->mount_opt & opt) { - btrfs_err(fs_info, "%s must be used with ro mount option", - opt_name); - return true; - } - return false; + return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES; } -static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) +static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *opts; - char *orig; - char *p; - substring_t args[MAX_OPT_ARGS]; - int ret = 0; + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + opt = fs_parse(fc, btrfs_fs_parameters, param, &result); + if (opt < 0) + return opt; - while ((p = strsep(&opts, ":")) != NULL) { - int token; + switch (opt) { + case Opt_degraded: + btrfs_set_opt(ctx->mount_opt, DEGRADED); + break; + case Opt_subvol_empty: + /* + * This exists because we used to allow it on accident, so we're + * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow + * empty subvol= again"). 
+ */ + break; + case Opt_subvol: + kfree(ctx->subvol_name); + ctx->subvol_name = kstrdup(param->string, GFP_KERNEL); + if (!ctx->subvol_name) + return -ENOMEM; + break; + case Opt_subvolid: + ctx->subvol_objectid = result.uint_64; - if (!*p) - continue; - token = match_token(p, rescue_tokens, args); - switch (token){ - case Opt_usebackuproot: - btrfs_info(info, - "trying to use backup root at mount time"); - btrfs_set_opt(info->mount_opt, USEBACKUPROOT); - break; - case Opt_nologreplay: - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_ignorebadroots: - btrfs_set_and_info(info, IGNOREBADROOTS, - "ignoring bad roots"); - break; - case Opt_ignoredatacsums: - btrfs_set_and_info(info, IGNOREDATACSUMS, - "ignoring data csums"); - break; - case Opt_rescue_all: - btrfs_info(info, "enabling all of the rescue options"); - btrfs_set_and_info(info, IGNOREDATACSUMS, - "ignoring data csums"); - btrfs_set_and_info(info, IGNOREBADROOTS, - "ignoring bad roots"); - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_err: - btrfs_info(info, "unrecognized rescue option '%s'", p); - ret = -EINVAL; - goto out; - default: - break; - } + /* subvolid=0 means give me the original fs_tree. */ + if (!ctx->subvol_objectid) + ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID; + break; + case Opt_device: { + struct btrfs_device *device; + blk_mode_t mode = btrfs_open_mode(fc); + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(param->string, mode, false); + mutex_unlock(&uuid_mutex); + if (IS_ERR(device)) + return PTR_ERR(device); + break; } -out: - kfree(orig); - return ret; -} - -/* - * Regular mount options parser. Everything that is needed only when - * reading in a new superblock is parsed here. - * XXX JDM: This needs to be cleaned up for remount. - */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags) -{ - substring_t args[MAX_OPT_ARGS]; - char *p, *num; - int intarg; - int ret = 0; - char *compress_type; - bool compress_force = false; - enum btrfs_compression_type saved_compress_type; - int saved_compress_level; - bool saved_compress_force; - int no_compress = 0; - const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state); - - if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) - btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); - else if (btrfs_free_space_cache_v1_active(info)) { - if (btrfs_is_zoned(info)) { - btrfs_info(info, - "zoned: clearing existing space cache"); - btrfs_set_super_cache_generation(info->super_copy, 0); + case Opt_datasum: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NODATASUM); } else { - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); } - } - - /* - * Even the options are empty, we still need to do extra check - * against new flags - */ - if (!options) - goto check; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_degraded: - btrfs_info(info, "allowing degraded mounts"); - btrfs_set_opt(info->mount_opt, DEGRADED); - break; - case Opt_subvol: - case Opt_subvol_empty: - case Opt_subvolid: - case Opt_device: - /* - * These are parsed by btrfs_parse_subvol_options or - * btrfs_parse_device_options and can be ignored here. 
- */ - break; - case Opt_nodatasum: - btrfs_set_and_info(info, NODATASUM, - "setting nodatasum"); - break; - case Opt_datasum: - if (btrfs_test_opt(info, NODATASUM)) { - if (btrfs_test_opt(info, NODATACOW)) - btrfs_info(info, - "setting datasum, datacow enabled"); - else - btrfs_info(info, "setting datasum"); - } - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - break; - case Opt_nodatacow: - if (!btrfs_test_opt(info, NODATACOW)) { - if (!btrfs_test_opt(info, COMPRESS) || - !btrfs_test_opt(info, FORCE_COMPRESS)) { - btrfs_info(info, - "setting nodatacow, compression disabled"); - } else { - btrfs_info(info, "setting nodatacow"); - } - } - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - btrfs_set_opt(info->mount_opt, NODATACOW); - btrfs_set_opt(info->mount_opt, NODATASUM); - break; - case Opt_datacow: - btrfs_clear_and_info(info, NODATACOW, - "setting datacow"); - break; - case Opt_compress_force: - case Opt_compress_force_type: - compress_force = true; - fallthrough; - case Opt_compress: - case Opt_compress_type: - saved_compress_type = btrfs_test_opt(info, - COMPRESS) ? - info->compress_type : BTRFS_COMPRESS_NONE; - saved_compress_force = - btrfs_test_opt(info, FORCE_COMPRESS); - saved_compress_level = info->compress_level; - if (token == Opt_compress || - token == Opt_compress_force || - strncmp(args[0].from, "zlib", 4) == 0) { - compress_type = "zlib"; - - info->compress_type = BTRFS_COMPRESS_ZLIB; - info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; - /* - * args[0] contains uninitialized data since - * for these tokens we don't expect any - * parameter. - */ - if (token != Opt_compress && - token != Opt_compress_force) - info->compress_level = - btrfs_compress_str2level( - BTRFS_COMPRESS_ZLIB, - args[0].from + 4); - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - no_compress = 0; - } else if (strncmp(args[0].from, "lzo", 3) == 0) { - compress_type = "lzo"; - info->compress_type = BTRFS_COMPRESS_LZO; - info->compress_level = 0; - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_LZO); - no_compress = 0; - } else if (strncmp(args[0].from, "zstd", 4) == 0) { - compress_type = "zstd"; - info->compress_type = BTRFS_COMPRESS_ZSTD; - info->compress_level = - btrfs_compress_str2level( - BTRFS_COMPRESS_ZSTD, - args[0].from + 4); - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_ZSTD); - no_compress = 0; - } else if (strncmp(args[0].from, "no", 2) == 0) { - compress_type = "no"; - info->compress_level = 0; - info->compress_type = 0; - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - compress_force = false; - no_compress++; - } else { - btrfs_err(info, "unrecognized compression value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - - if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - } else { - /* - * If we remount from compress-force=xxx to - * compress=xxx, we need clear FORCE_COMPRESS - * flag, otherwise, there is no way for users - * to disable forcible compression separately. 
- */ - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - } - if (no_compress == 1) { - btrfs_info(info, "use no compression"); - } else if ((info->compress_type != saved_compress_type) || - (compress_force != saved_compress_force) || - (info->compress_level != saved_compress_level)) { - btrfs_info(info, "%s %s compression, level %d", - (compress_force) ? "force" : "use", - compress_type, info->compress_level); - } - compress_force = false; - break; - case Opt_ssd: - btrfs_set_and_info(info, SSD, - "enabling ssd optimizations"); - btrfs_clear_opt(info->mount_opt, NOSSD); - break; - case Opt_ssd_spread: - btrfs_set_and_info(info, SSD, - "enabling ssd optimizations"); - btrfs_set_and_info(info, SSD_SPREAD, - "using spread ssd allocation scheme"); - btrfs_clear_opt(info->mount_opt, NOSSD); - break; - case Opt_nossd: - btrfs_set_opt(info->mount_opt, NOSSD); - btrfs_clear_and_info(info, SSD, - "not using ssd optimizations"); - fallthrough; - case Opt_nossd_spread: - btrfs_clear_and_info(info, SSD_SPREAD, - "not using spread ssd allocation scheme"); - break; - case Opt_barrier: - btrfs_clear_and_info(info, NOBARRIER, - "turning on barriers"); - break; - case Opt_nobarrier: - btrfs_set_and_info(info, NOBARRIER, - "turning off barriers"); - break; - case Opt_thread_pool: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized thread_pool value %s", - args[0].from); - goto out; - } else if (intarg == 0) { - btrfs_err(info, "invalid value 0 for thread_pool"); - ret = -EINVAL; - goto out; - } - info->thread_pool_size = intarg; - break; - case Opt_max_inline: - num = match_strdup(&args[0]); - if (num) { - info->max_inline = memparse(num, NULL); - kfree(num); - - if (info->max_inline) { - info->max_inline = min_t(u64, - info->max_inline, - info->sectorsize); - } - btrfs_info(info, "max_inline at %llu", - info->max_inline); - } else { - ret = -ENOMEM; - goto out; - } - break; - case Opt_acl: + break; + case Opt_datacow: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(ctx->mount_opt, NODATACOW); + btrfs_set_opt(ctx->mount_opt, NODATASUM); + } else { + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + } + break; + case Opt_compress_force: + case Opt_compress_force_type: + btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS); + fallthrough; + case Opt_compress: + case Opt_compress_type: + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears + * "force-compress" without the need to pass + * "compress-force=[no|none]" before specifying "compress". 
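For example, "-o compress-force=zstd,compress=zlib:3" ends up with plain (non-forced) zlib compression at level 3.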
+ */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "zlib", 4) == 0) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = + btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, + param->string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "lzo", 3) == 0) { + ctx->compress_type = BTRFS_COMPRESS_LZO; + ctx->compress_level = 0; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "zstd", 4) == 0) { + ctx->compress_type = BTRFS_COMPRESS_ZSTD; + ctx->compress_level = + btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, + param->string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "no", 2) == 0) { + ctx->compress_level = 0; + ctx->compress_type = 0; + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + } else { + btrfs_err(NULL, "unrecognized compression value %s", + param->string); + return -EINVAL; + } + break; + case Opt_ssd: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSSD); + btrfs_clear_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_ssd_spread: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_set_opt(ctx->mount_opt, SSD_SPREAD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_barrier: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOBARRIER); + else + btrfs_clear_opt(ctx->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + if (result.uint_32 == 0) { + btrfs_err(NULL, "invalid value 0 for thread_pool"); + return -EINVAL; + } + ctx->thread_pool_size = result.uint_32; + break; + case Opt_max_inline: + ctx->max_inline = memparse(param->string, NULL); + break; + case Opt_acl: + if (result.negated) { + fc->sb_flags &= ~SB_POSIXACL; + } else { #ifdef CONFIG_BTRFS_FS_POSIX_ACL - info->sb->s_flags |= SB_POSIXACL; - break; + fc->sb_flags |= SB_POSIXACL; #else - btrfs_err(info, "support for ACL not compiled in!"); - ret = -EINVAL; - goto out; + btrfs_err(NULL, "support for ACL not compiled in"); + return -EINVAL; #endif - case Opt_noacl: - info->sb->s_flags &= ~SB_POSIXACL; - break; - case Opt_notreelog: - btrfs_set_and_info(info, NOTREELOG, - "disabling tree log"); - break; - case Opt_treelog: - btrfs_clear_and_info(info, NOTREELOG, - "enabling tree log"); - break; - case Opt_norecovery: - case Opt_nologreplay: - btrfs_warn(info, + } + /* + * VFS limits the ability to toggle ACL on and off via remount, + * despite every file system allowing this. This seems to be + * an oversight since we all do, but it'll fail if we're + * remounting. 
So don't set the mask here, we'll check it in + * btrfs_reconfigure and do the toggling ourselves. + */ + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) + fc->sb_flags_mask |= SB_POSIXACL; + break; + case Opt_treelog: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOTREELOG); + else + btrfs_clear_opt(ctx->mount_opt, NOTREELOG); + break; + case Opt_nologreplay: + btrfs_warn(NULL, "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_flushoncommit: - btrfs_set_and_info(info, FLUSHONCOMMIT, - "turning on flush-on-commit"); - break; - case Opt_noflushoncommit: - btrfs_clear_and_info(info, FLUSHONCOMMIT, - "turning off flush-on-commit"); - break; - case Opt_ratio: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized metadata_ratio value %s", - args[0].from); - goto out; - } - info->metadata_ratio = intarg; - btrfs_info(info, "metadata ratio %u", - info->metadata_ratio); - break; - case Opt_discard: - case Opt_discard_mode: - if (token == Opt_discard || - strcmp(args[0].from, "sync") == 0) { - btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); - btrfs_set_and_info(info, DISCARD_SYNC, - "turning on sync discard"); - } else if (strcmp(args[0].from, "async") == 0) { - btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); - btrfs_set_and_info(info, DISCARD_ASYNC, - "turning on async discard"); - } else { - btrfs_err(info, "unrecognized discard mode value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - btrfs_clear_opt(info->mount_opt, NODISCARD); - break; - case Opt_nodiscard: - btrfs_clear_and_info(info, DISCARD_SYNC, - "turning off discard"); - btrfs_clear_and_info(info, DISCARD_ASYNC, - "turning off async discard"); - btrfs_set_opt(info->mount_opt, NODISCARD); - break; - case Opt_space_cache: - case Opt_space_cache_version: - /* - * We already set FREE_SPACE_TREE above because we have - * compat_ro(FREE_SPACE_TREE) set, and we aren't going - * to allow v1 to be set for extent tree v2, simply - * ignore this setting if we're extent tree v2. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - if (token == Opt_space_cache || - strcmp(args[0].from, "v1") == 0) { - btrfs_clear_opt(info->mount_opt, - FREE_SPACE_TREE); - btrfs_set_and_info(info, SPACE_CACHE, - "enabling disk space caching"); - } else if (strcmp(args[0].from, "v2") == 0) { - btrfs_clear_opt(info->mount_opt, - SPACE_CACHE); - btrfs_set_and_info(info, FREE_SPACE_TREE, - "enabling free space tree"); - } else { - btrfs_err(info, "unrecognized space_cache value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - break; - case Opt_rescan_uuid_tree: - btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); - break; - case Opt_no_space_cache: - /* - * We cannot operate without the free space tree with - * extent tree v2, ignore this option. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - if (btrfs_test_opt(info, SPACE_CACHE)) { - btrfs_clear_and_info(info, SPACE_CACHE, - "disabling disk space caching"); - } - if (btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_clear_and_info(info, FREE_SPACE_TREE, - "disabling free space tree"); - } - break; - case Opt_inode_cache: - case Opt_noinode_cache: - btrfs_warn(info, - "the 'inode_cache' option is deprecated and has no effect since 5.11"); - break; - case Opt_clear_cache: - /* - * We cannot clear the free space tree with extent tree - * v2, ignore this option. 
- */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - btrfs_set_and_info(info, CLEAR_CACHE, - "force clearing of disk cache"); - break; - case Opt_user_subvol_rm_allowed: - btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; + case Opt_norecovery: + btrfs_info(NULL, +"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; + case Opt_flushoncommit: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); + else + btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + ctx->metadata_ratio = result.uint_32; + break; + case Opt_discard: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, NODISCARD); + } else { + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + } + break; + case Opt_discard_mode: + switch (result.uint_32) { + case Opt_discard_sync: + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); break; - case Opt_enospc_debug: - btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + case Opt_discard_async: + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC); break; - case Opt_noenospc_debug: - btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + default: + btrfs_err(NULL, "unrecognized discard mode value %s", + param->key); + return -EINVAL; + } + btrfs_clear_opt(ctx->mount_opt, NODISCARD); + break; + case Opt_space_cache: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSPACECACHE); + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + } else { + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + } + break; + case Opt_space_cache_version: + switch (result.uint_32) { + case Opt_space_cache_v1: + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); break; - case Opt_defrag: - btrfs_set_and_info(info, AUTO_DEFRAG, - "enabling auto defrag"); + case Opt_space_cache_v2: + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE); break; - case Opt_nodefrag: - btrfs_clear_and_info(info, AUTO_DEFRAG, - "disabling auto defrag"); + default: + btrfs_err(NULL, "unrecognized space_cache value %s", + param->key); + return -EINVAL; + } + break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE); + break; + case Opt_clear_cache: + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG); + else + btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG); + else + btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG); + break; + case Opt_usebackuproot: + btrfs_warn(NULL, + "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead"); + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); + + /* If we're loading the backup roots we can't trust the space cache. 
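Its contents may describe a newer generation than the backup root we end up using, so force it to be rebuilt.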
*/ + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_skip_balance: + btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE); + break; + case Opt_fatal_errors: + switch (result.uint_32) { + case Opt_fatal_errors_panic: + btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; - case Opt_recovery: - case Opt_usebackuproot: - btrfs_warn(info, - "'%s' is deprecated, use 'rescue=usebackuproot' instead", - token == Opt_recovery ? "recovery" : - "usebackuproot"); - btrfs_info(info, - "trying to use backup root at mount time"); - btrfs_set_opt(info->mount_opt, USEBACKUPROOT); + case Opt_fatal_errors_bug: + btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; - case Opt_skip_balance: - btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + default: + btrfs_err(NULL, "unrecognized fatal_errors value %s", + param->key); + return -EINVAL; + } + break; + case Opt_commit_interval: + ctx->commit_interval = result.uint_32; + if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { + btrfs_warn(NULL, "excessive commit interval %u, use with care", + ctx->commit_interval); + } + if (ctx->commit_interval == 0) + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + break; + case Opt_rescue: + switch (result.uint_32) { + case Opt_rescue_usebackuproot: + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); break; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - case Opt_check_integrity_including_extent_data: - btrfs_warn(info, - "integrity checker is deprecated and will be removed in 6.7"); - btrfs_info(info, - "enabling check integrity including extent data"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + case Opt_rescue_nologreplay: + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; - case Opt_check_integrity: - btrfs_warn(info, - "integrity checker is deprecated and will be removed in 6.7"); - btrfs_info(info, "enabling check integrity"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + case Opt_rescue_ignorebadroots: + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); break; - case Opt_check_integrity_print_mask: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, - "unrecognized check_integrity_print_mask value %s", - args[0].from); - goto out; - } - info->check_integrity_print_mask = intarg; - btrfs_warn(info, - "integrity checker is deprecated and will be removed in 6.7"); - btrfs_info(info, "check_integrity_print_mask 0x%x", - info->check_integrity_print_mask); + case Opt_rescue_ignoredatacsums: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); break; -#else - case Opt_check_integrity_including_extent_data: - case Opt_check_integrity: - case Opt_check_integrity_print_mask: - btrfs_err(info, - "support for check_integrity* not compiled in!"); - ret = -EINVAL; - goto out; -#endif - case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) { - btrfs_set_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - } else if (strcmp(args[0].from, "bug") == 0) { - btrfs_clear_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - } else { - btrfs_err(info, "unrecognized fatal_errors value %s", - args[0].from); - ret = -EINVAL; - goto out; - } + case Opt_rescue_ignoremetacsums: + btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS); break; - case Opt_commit_interval: - intarg = 0; - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized commit_interval value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - if (intarg == 0) { - btrfs_info(info, - "using default commit interval %us", - 
BTRFS_DEFAULT_COMMIT_INTERVAL); - intarg = BTRFS_DEFAULT_COMMIT_INTERVAL; - } else if (intarg > 300) { - btrfs_warn(info, "excessive commit interval %d", - intarg); - } - info->commit_interval = intarg; + case Opt_rescue_ignoresuperflags: + btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS); break; - case Opt_rescue: - ret = parse_rescue_options(info, args[0].from); - if (ret < 0) { - btrfs_err(info, "unrecognized rescue value %s", - args[0].from); - goto out; - } + case Opt_rescue_parameter_all: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS); + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; + default: + btrfs_info(NULL, "unrecognized rescue option '%s'", + param->key); + return -EINVAL; + } + break; #ifdef CONFIG_BTRFS_DEBUG - case Opt_fragment_all: - btrfs_info(info, "fragmenting all space"); - btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); - btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + case Opt_fragment: + switch (result.uint_32) { + case Opt_fragment_parameter_all: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); break; - case Opt_fragment_metadata: - btrfs_info(info, "fragmenting metadata"); - btrfs_set_opt(info->mount_opt, - FRAGMENT_METADATA); + case Opt_fragment_parameter_metadata: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); break; - case Opt_fragment_data: - btrfs_info(info, "fragmenting data"); - btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + case Opt_fragment_parameter_data: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); break; + default: + btrfs_info(NULL, "unrecognized fragment option '%s'", + param->key); + return -EINVAL; + } + break; #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY - case Opt_ref_verify: - btrfs_info(info, "doing ref verification"); - btrfs_set_opt(info->mount_opt, REF_VERIFY); - break; + case Opt_ref_verify: + btrfs_set_opt(ctx->mount_opt, REF_VERIFY); + break; #endif - case Opt_err: - btrfs_err(info, "unrecognized mount option '%s'", p); - ret = -EINVAL; - goto out; - default: - break; - } + default: + btrfs_err(NULL, "unrecognized mount option '%s'", param->key); + return -EINVAL; } -check: - /* We're read-only, don't have to check. */ - if (new_flags & SB_RDONLY) - goto out; - if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || - check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || - check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")) - ret = -EINVAL; -out: - if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, CLEAR_CACHE)) { - btrfs_err(info, "cannot disable free space tree"); - ret = -EINVAL; - } - if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_err(info, "cannot disable free space tree with block-group-tree feature"); - ret = -EINVAL; - } - if (!ret) - ret = btrfs_check_mountopts_zoned(info); - if (!ret && !remounting) { - if (btrfs_test_opt(info, SPACE_CACHE)) - btrfs_info(info, "disk space caching is enabled"); - if (btrfs_test_opt(info, FREE_SPACE_TREE)) - btrfs_info(info, "using free space tree"); - } - return ret; + return 0; } /* - * Parse mount options that are required early in the mount process. - * - * All other options will be parsed on much later in the mount process and - * only when we need to allocate a new super block. 
+ * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount code + * paths. */ -static int btrfs_parse_device_options(const char *options, blk_mode_t flags) +static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) { - substring_t args[MAX_OPT_ARGS]; - char *device_name, *opts, *orig, *p; - struct btrfs_device *device = NULL; - int error = 0; + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); +} - lockdep_assert_held(&uuid_mutex); +static bool check_ro_option(const struct btrfs_fs_info *fs_info, + unsigned long long mount_opt, unsigned long long opt, + const char *opt_name) +{ + if (mount_opt & opt) { + btrfs_err(fs_info, "%s must be used with ro mount option", + opt_name); + return true; + } + return false; +} - if (!options) - return 0; +bool btrfs_check_options(const struct btrfs_fs_info *info, + unsigned long long *mount_opt, + unsigned long flags) +{ + bool ret = true; - /* - * strsep changes the string, duplicate it because btrfs_parse_options - * gets called later - */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + if (!(flags & SB_RDONLY) && + (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREMETACSUMS, "ignoremetacsums") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNORESUPERFLAGS, "ignoresuperflags"))) + ret = false; - while ((p = strsep(&opts, ",")) != NULL) { - int token; + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) { + btrfs_err(info, "cannot disable free-space-tree"); + ret = false; + } + if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) { + btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature"); + ret = false; + } - if (!*p) - continue; + if (btrfs_check_mountopts_zoned(info, mount_opt)) + ret = false; - token = match_token(p, tokens, args); - if (token == Opt_device) { - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - device = btrfs_scan_one_device(device_name, flags); - kfree(device_name); - if (IS_ERR(device)) { - error = PTR_ERR(device); - goto out; - } + if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { + btrfs_info(info, "disk space caching is enabled"); + btrfs_warn(info, +"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); } + if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) + btrfs_info(info, "using free-space-tree"); } -out: - kfree(orig); - return error; + return ret; } /* - * Parse mount options that are related to subvolume id + * This is subtle, we only call this during open_ctree(). We need to pre-load + * the mount options with the on-disk settings. Before the new mount API took + * effect we would do this on mount and remount. With the new mount API we'll + * only do this on the initial mount. 
 *
- * The value is later passed to mount_subvol()
+ * This isn't a change in behavior, because we're using the current state of the
+ * file system to set the current mount options. If you mounted with special
+ * options to disable these features and then remounted, we wouldn't revert the
+ * settings, because mounting without these features cleared the on-disk
+ * settings, so this being called on remount is not needed.
 */
-static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
-		u64 *subvol_objectid)
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info)
 {
-	substring_t args[MAX_OPT_ARGS];
-	char *opts, *orig, *p;
-	int error = 0;
-	u64 subvolid;
-
-	if (!options)
-		return 0;
+	if (fs_info->sectorsize < PAGE_SIZE) {
+		btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+		if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
+			btrfs_info(fs_info,
+				   "forcing free space tree for sector size %u with page size %lu",
+				   fs_info->sectorsize, PAGE_SIZE);
+			btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+		}
+	}

 	/*
-	 * strsep changes the string, duplicate it because
-	 * btrfs_parse_device_options gets called later
+	 * At this point our mount options are populated, so we only mess with
+	 * these settings if we don't have any settings already.
 	 */
-	opts = kstrdup(options, GFP_KERNEL);
-	if (!opts)
-		return -ENOMEM;
-	orig = opts;
+	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+		return;

-	while ((p = strsep(&opts, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
+	if (btrfs_is_zoned(fs_info) &&
+	    btrfs_free_space_cache_v1_active(fs_info)) {
+		btrfs_info(fs_info, "zoned: clearing existing space cache");
+		btrfs_set_super_cache_generation(fs_info->super_copy, 0);
+		return;
+	}

-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_subvol:
-			kfree(*subvol_name);
-			*subvol_name = match_strdup(&args[0]);
-			if (!*subvol_name) {
-				error = -ENOMEM;
-				goto out;
-			}
-			break;
-		case Opt_subvolid:
-			error = match_u64(&args[0], &subvolid);
-			if (error)
-				goto out;
+	if (btrfs_test_opt(fs_info, SPACE_CACHE))
+		return;

-			/* we want the original fs_tree */
-			if (subvolid == 0)
-				subvolid = BTRFS_FS_TREE_OBJECTID;
+	if (btrfs_test_opt(fs_info, NOSPACECACHE))
+		return;

-			*subvol_objectid = subvolid;
-			break;
-		default:
-			break;
-		}
-	}
+	/*
+	 * At this point we don't have explicit options set by the user, set
+	 * them ourselves based on the state of the file system.
+	 */
+	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+	else if (btrfs_free_space_cache_v1_active(fs_info))
+		btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+}

-out:
-	kfree(orig);
-	return error;
+static void set_device_specific_options(struct btrfs_fs_info *fs_info)
+{
+	if (!btrfs_test_opt(fs_info, NOSSD) &&
+	    !fs_info->fs_devices->rotating)
+		btrfs_set_opt(fs_info->mount_opt, SSD);
+
+	/*
+	 * For devices supporting discard, turn on discard=async automatically,
+	 * unless it's already set or disabled. This could be turned off by
+	 * nodiscard for the same mount.
+	 *
+	 * The zoned mode piggybacks on the discard functionality for
+	 * resetting a zone. There is no reason to delay the zone reset as it
+	 * is fast enough. So, do not enable async discard for zoned mode.
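+	 * (In zoned mode the zone reset itself is the discard, and it has to
+	 * happen before the zone can be rewritten anyway, so there is nothing
+	 * to gain from deferring it.)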
+ */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable && + !btrfs_is_zoned(fs_info)) + btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC); } char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, @@ -1131,8 +949,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec } static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data) + struct btrfs_fs_devices *fs_devices) { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1148,10 +965,6 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= SB_POSIXACL; -#endif - sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; err = super_setup_bdi(sb); @@ -1160,13 +973,13 @@ static int btrfs_fill_super(struct super_block *sb, return err; } - err = open_ctree(sb, fs_devices, (char *)data); + err = open_ctree(sb, fs_devices); if (err) { - btrfs_err(fs_info, "open_ctree failed"); + btrfs_err(fs_info, "open_ctree failed: %d", err); return err; } - inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); + inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { err = PTR_ERR(inode); btrfs_handle_fs_error(fs_info, err, NULL); @@ -1200,7 +1013,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1282,6 +1095,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) print_rescue_option(seq, "ignorebadroots", &printed); if (btrfs_test_opt(info, IGNOREDATACSUMS)) print_rescue_option(seq, "ignoredatacsums", &printed); + if (btrfs_test_opt(info, IGNOREMETACSUMS)) + print_rescue_option(seq, "ignoremetacsums", &printed); + if (btrfs_test_opt(info, IGNORESUPERFLAGS)) + print_rescue_option(seq, "ignoresuperflags", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) @@ -1308,15 +1125,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) - seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(info, CHECK_INTEGRITY)) - seq_puts(seq, ",check_int"); - if (info->check_integrity_print_mask) - seq_printf(seq, ",check_int_print_mask=%d", - info->check_integrity_print_mask); -#endif if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) @@ -1331,34 +1139,16 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif if (btrfs_test_opt(info, REF_VERIFY)) seq_puts(seq, ",ref_verify"); - seq_printf(seq, ",subvolid=%llu", - BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); subvol_name = btrfs_get_subvol_name_from_objectid(info, - BTRFS_I(d_inode(dentry))->root->root_key.objectid); + btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); if (!IS_ERR(subvol_name)) { - seq_puts(seq, ",subvol="); - 
seq_escape(seq, subvol_name, " \t\n\\");
+		seq_show_option(seq, "subvol", subvol_name);
 		kfree(subvol_name);
 	}
 	return 0;
 }

-static int btrfs_test_super(struct super_block *s, void *data)
-{
-	struct btrfs_fs_info *p = data;
-	struct btrfs_fs_info *fs_info = btrfs_sb(s);
-
-	return fs_info->fs_devices == p->fs_devices;
-}
-
-static int btrfs_set_super(struct super_block *s, void *data)
-{
-	int err = set_anon_super(s, data);
-	if (!err)
-		s->s_fs_info = data;
-	return err;
-}
-
 /*
  * subvolumes are identified by ino 256
  */
@@ -1402,7 +1192,7 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
 	struct super_block *s = root->d_sb;
 	struct btrfs_fs_info *fs_info = btrfs_sb(s);
 	struct inode *root_inode = d_inode(root);
-	u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+	u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root);

 	ret = 0;
 	if (!is_subvolume_inode(root_inode)) {
@@ -1434,195 +1224,6 @@ out:
 	return root;
 }

-/*
- * Find a superblock for the given device / mount point.
- *
- * Note: This is based on mount_bdev from fs/super.c with a few additions
- *       for multiple device setup. Make sure to keep it in sync.
- */
-static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
-		int flags, const char *device_name, void *data)
-{
-	struct block_device *bdev = NULL;
-	struct super_block *s;
-	struct btrfs_device *device = NULL;
-	struct btrfs_fs_devices *fs_devices = NULL;
-	struct btrfs_fs_info *fs_info = NULL;
-	void *new_sec_opts = NULL;
-	blk_mode_t mode = sb_open_mode(flags);
-	int error = 0;
-
-	if (data) {
-		error = security_sb_eat_lsm_opts(data, &new_sec_opts);
-		if (error)
-			return ERR_PTR(error);
-	}
-
-	/*
-	 * Setup a dummy root and fs_info for test/set super. This is because
-	 * we don't actually fill this stuff out until open_ctree, but we need
-	 * it for searching for existing supers, so this lets us do that and
-	 * then open_ctree will properly initialize the file system specific
-	 * settings later. btrfs_init_fs_info initializes the static elements
-	 * of the fs_info (locks and such) to make cleanup easier if we find a
-	 * superblock with our given fs_devices later on at sget() time.
- */ - fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); - if (!fs_info) { - error = -ENOMEM; - goto error_sec_opts; - } - btrfs_init_fs_info(fs_info); - - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - if (!fs_info->super_copy || !fs_info->super_for_commit) { - error = -ENOMEM; - goto error_fs_info; - } - - mutex_lock(&uuid_mutex); - error = btrfs_parse_device_options(data, mode); - if (error) { - mutex_unlock(&uuid_mutex); - goto error_fs_info; - } - - device = btrfs_scan_one_device(device_name, mode); - if (IS_ERR(device)) { - mutex_unlock(&uuid_mutex); - error = PTR_ERR(device); - goto error_fs_info; - } - - fs_devices = device->fs_devices; - fs_info->fs_devices = fs_devices; - - error = btrfs_open_devices(fs_devices, mode, fs_type); - mutex_unlock(&uuid_mutex); - if (error) - goto error_fs_info; - - if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - - bdev = fs_devices->latest_dev->bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, - fs_info); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto error_close_devices; - } - - if (s->s_root) { - btrfs_close_devices(fs_devices); - btrfs_free_fs_info(fs_info); - if ((flags ^ s->s_flags) & SB_RDONLY) - error = -EBUSY; - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, - s->s_id); - btrfs_sb(s)->bdev_holder = fs_type; - error = btrfs_fill_super(s, fs_devices, data); - } - if (!error) - error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL); - security_free_mnt_opts(&new_sec_opts); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - return dget(s->s_root); - -error_close_devices: - btrfs_close_devices(fs_devices); -error_fs_info: - btrfs_free_fs_info(fs_info); -error_sec_opts: - security_free_mnt_opts(&new_sec_opts); - return ERR_PTR(error); -} - -/* - * Mount function which is called by VFS layer. - * - * In order to allow mounting a subvolume directly, btrfs uses mount_subtree() - * which needs vfsmount* of device's root (/). This means device's root has to - * be mounted internally in any case. - * - * Operation flow: - * 1. Parse subvol id related options for later use in mount_subvol(). - * - * 2. Mount device's root (/) by calling vfs_kern_mount(). - * - * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the - * first place. In order to avoid calling btrfs_mount() again, we use - * different file_system_type which is not registered to VFS by - * register_filesystem() (btrfs_root_fs_type). As a result, - * btrfs_mount_root() is called. The return value will be used by - * mount_subtree() in mount_subvol(). - * - * 3. Call mount_subvol() to get the dentry of subvolume. Since there is - * "btrfs subvolume set-default", mount_subvol() is called always. 
- */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, - const char *device_name, void *data) -{ - struct vfsmount *mnt_root; - struct dentry *root; - char *subvol_name = NULL; - u64 subvol_objectid = 0; - int error = 0; - - error = btrfs_parse_subvol_options(data, &subvol_name, - &subvol_objectid); - if (error) { - kfree(subvol_name); - return ERR_PTR(error); - } - - /* mount device's root (/) */ - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data); - if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) { - if (flags & SB_RDONLY) { - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, - flags & ~SB_RDONLY, device_name, data); - } else { - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, - flags | SB_RDONLY, device_name, data); - if (IS_ERR(mnt_root)) { - root = ERR_CAST(mnt_root); - kfree(subvol_name); - goto out; - } - - down_write(&mnt_root->mnt_sb->s_umount); - error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL); - up_write(&mnt_root->mnt_sb->s_umount); - if (error < 0) { - root = ERR_PTR(error); - mntput(mnt_root); - kfree(subvol_name); - goto out; - } - } - } - if (IS_ERR(mnt_root)) { - root = ERR_CAST(mnt_root); - kfree(subvol_name); - goto out; - } - - /* mount_subvol() will free subvol_name and mnt_root */ - root = mount_subvol(subvol_name, subvol_objectid, mnt_root); - -out: - return root; -} - static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, u32 new_pool_size, u32 old_pool_size) { @@ -1645,7 +1246,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, } static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, - unsigned long old_opts, int flags) + unsigned long long old_opts, int flags) { if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || @@ -1659,7 +1260,7 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, } static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, - unsigned long old_opts) + unsigned long long old_opts) { const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); @@ -1685,202 +1286,283 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); } -static int btrfs_remount(struct super_block *sb, int *flags, char *data) +static int btrfs_remount_rw(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - unsigned old_flags = sb->s_flags; - unsigned long old_opts = fs_info->mount_opt; - unsigned long old_compress_type = fs_info->compress_type; - u64 old_max_inline = fs_info->max_inline; - u32 old_thread_pool_size = fs_info->thread_pool_size; - u32 old_metadata_ratio = fs_info->metadata_ratio; int ret; - sync_filesystem(sb); - set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + if (BTRFS_FS_ERROR(fs_info)) { + btrfs_err(fs_info, + "remounting read-write after error is not allowed"); + return -EINVAL; + } - if (data) { - void *new_sec_opts = NULL; + if (fs_info->fs_devices->rw_devices == 0) + return -EACCES; - ret = security_sb_eat_lsm_opts(data, &new_sec_opts); - if (!ret) - ret = security_sb_remount(sb, new_sec_opts); - security_free_mnt_opts(&new_sec_opts); - if (ret) - goto restore; + if (!btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "too many missing devices, writable remount is not allowed"); + return -EACCES; + } + + if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); 
+		return -EINVAL;
 	}

-	ret = btrfs_parse_options(fs_info, data, *flags);
+	/*
+	 * NOTE: when remounting with a change that does writes, don't put it
+	 * anywhere above this point, as we are not sure to be safe to write
+	 * until we pass the above checks.
+	 */
+	ret = btrfs_start_pre_rw_mount(fs_info);
 	if (ret)
-		goto restore;
+		return ret;

-	ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
-	if (ret < 0)
-		goto restore;
+	btrfs_clear_sb_rdonly(fs_info->sb);

-	btrfs_remount_begin(fs_info, old_opts, *flags);
-	btrfs_resize_thread_pool(fs_info,
-		fs_info->thread_pool_size, old_thread_pool_size);
+	set_bit(BTRFS_FS_OPEN, &fs_info->flags);

-	if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
-	    (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
-	    (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
-		btrfs_warn(fs_info,
-			"remount supports changing free space tree only from ro to rw");
-		/* Make sure free space cache options match the state on disk */
-		if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
-			btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
-			btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
-		}
-		if (btrfs_free_space_cache_v1_active(fs_info)) {
-			btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
-			btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
-		}
-	}
+	/*
+	 * If we've gone from readonly -> read-write, we need to get our
+	 * sync/async discard lists in the right state.
+	 */
+	btrfs_discard_resume(fs_info);

-	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
-		goto out;
+	return 0;
+}

-	if (*flags & SB_RDONLY) {
-		/*
-		 * this also happens on 'umount -rf' or on shutdown, when
-		 * the filesystem is busy.
-		 */
-		cancel_work_sync(&fs_info->async_reclaim_work);
-		cancel_work_sync(&fs_info->async_data_reclaim_work);
+static int btrfs_remount_ro(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * This also happens on 'umount -rf' or on shutdown, when the
+	 * filesystem is busy.
+	 */
+	cancel_work_sync(&fs_info->async_reclaim_work);
+	cancel_work_sync(&fs_info->async_data_reclaim_work);

-		btrfs_discard_cleanup(fs_info);
+	btrfs_discard_cleanup(fs_info);

-		/* wait for the uuid_scan task to finish */
-		down(&fs_info->uuid_tree_rescan_sem);
-		/* avoid complains from lockdep et al. */
-		up(&fs_info->uuid_tree_rescan_sem);
+	/* Wait for the uuid_scan task to finish */
+	down(&fs_info->uuid_tree_rescan_sem);
+	/* Avoid complaints from lockdep et al. */
+	up(&fs_info->uuid_tree_rescan_sem);

-		btrfs_set_sb_rdonly(sb);
+	btrfs_set_sb_rdonly(fs_info->sb);

-		/*
-		 * Setting SB_RDONLY will put the cleaner thread to
-		 * sleep at the next loop if it's already active.
-		 * If it's already asleep, we'll leave unused block
-		 * groups on disk until we're mounted read-write again
-		 * unless we clean them up here.
-		 */
-		btrfs_delete_unused_bgs(fs_info);
+	/*
+	 * Setting SB_RDONLY will put the cleaner thread to sleep at the next
+	 * loop if it's already active. If it's already asleep, we'll leave
+	 * unused block groups on disk until we're mounted read-write again
+	 * unless we clean them up here.
+	 */
+	btrfs_delete_unused_bgs(fs_info);

-		/*
-		 * The cleaner task could be already running before we set the
-		 * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
-		 * We must make sure that after we finish the remount, i.e.
after - * we call btrfs_commit_super(), the cleaner can no longer start - * a transaction - either because it was dropping a dead root, - * running delayed iputs or deleting an unused block group (the - * cleaner picked a block group from the list of unused block - * groups before we were able to in the previous call to - * btrfs_delete_unused_bgs()). - */ - wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, - TASK_UNINTERRUPTIBLE); + /* + * The cleaner task could be already running before we set the flag + * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make + * sure that after we finish the remount, i.e. after we call + * btrfs_commit_super(), the cleaner can no longer start a transaction + * - either because it was dropping a dead root, running delayed iputs + * or deleting an unused block group (the cleaner picked a block + * group from the list of unused block groups before we were able to + * in the previous call to btrfs_delete_unused_bgs()). + */ + wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE); - /* - * We've set the superblock to RO mode, so we might have made - * the cleaner task sleep without running all pending delayed - * iputs. Go through all the delayed iputs here, so that if an - * unmount happens without remounting RW we don't end up at - * finishing close_ctree() with a non-empty list of delayed - * iputs. - */ - btrfs_run_delayed_iputs(fs_info); + /* + * We've set the superblock to RO mode, so we might have made the + * cleaner task sleep without running all pending delayed iputs. Go + * through all the delayed iputs here, so that if an unmount happens + * without remounting RW we don't end up at finishing close_ctree() + * with a non-empty list of delayed iputs. + */ + btrfs_run_delayed_iputs(fs_info); - btrfs_dev_replace_suspend_for_unmount(fs_info); - btrfs_scrub_cancel(fs_info); - btrfs_pause_balance(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); - /* - * Pause the qgroup rescan worker if it is running. We don't want - * it to be still running after we are in RO mode, as after that, - * by the time we unmount, it might have left a transaction open, - * so we would leak the transaction and/or crash. - */ - btrfs_qgroup_wait_for_completion(fs_info, false); + /* + * Pause the qgroup rescan worker if it is running. We don't want it to + * be still running after we are in RO mode, as after that, by the time + * we unmount, it might have left a transaction open, so we would leak + * the transaction and/or crash. 
+ */ + btrfs_qgroup_wait_for_completion(fs_info, false); - ret = btrfs_commit_super(fs_info); - if (ret) - goto restore; - } else { - if (BTRFS_FS_ERROR(fs_info)) { - btrfs_err(fs_info, - "Remounting read-write after error is not allowed"); - ret = -EINVAL; - goto restore; - } - if (fs_info->fs_devices->rw_devices == 0) { - ret = -EACCES; - goto restore; - } + return btrfs_commit_super(fs_info); +} - if (!btrfs_check_rw_degradable(fs_info, NULL)) { - btrfs_warn(fs_info, - "too many missing devices, writable remount is not allowed"); - ret = -EACCES; - goto restore; - } +static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) +{ + fs_info->max_inline = ctx->max_inline; + fs_info->commit_interval = ctx->commit_interval; + fs_info->metadata_ratio = ctx->metadata_ratio; + fs_info->thread_pool_size = ctx->thread_pool_size; + fs_info->mount_opt = ctx->mount_opt; + fs_info->compress_type = ctx->compress_type; + fs_info->compress_level = ctx->compress_level; +} - if (btrfs_super_log_root(fs_info->super_copy) != 0) { - btrfs_warn(fs_info, - "mount required to replay tree-log, cannot remount read-write"); - ret = -EINVAL; - goto restore; - } +static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) +{ + ctx->max_inline = fs_info->max_inline; + ctx->commit_interval = fs_info->commit_interval; + ctx->metadata_ratio = fs_info->metadata_ratio; + ctx->thread_pool_size = fs_info->thread_pool_size; + ctx->mount_opt = fs_info->mount_opt; + ctx->compress_type = fs_info->compress_type; + ctx->compress_level = fs_info->compress_level; +} - /* - * NOTE: when remounting with a change that does writes, don't - * put it anywhere above this point, as we are not sure to be - * safe to write until we pass the above checks. - */ - ret = btrfs_start_pre_rw_mount(fs_info); - if (ret) - goto restore; +#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \ +do { \ + if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) 
\
+do { \
+	if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+	    !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+		btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+			       struct btrfs_fs_context *old)
+{
+	btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+	btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
+	btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
+	btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
+	btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
+	btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
+	btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log");
+	btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time");
+	btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit");
+	btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard");
+	btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard");
+	btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree");
+	btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching");
+	btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache");
+	btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag");
+	btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data");
+	btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata");
+	btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification");
+	btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
+	btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
+	btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+	btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
+	btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
+
+	btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
+	btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
+	btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
+	btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
+	btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
+	btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
+	btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
+	btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag");
+	btrfs_info_if_unset(info, old, COMPRESS, "use no compression");
+
+	/* Did the compression settings change? */
+	if (btrfs_test_opt(info, COMPRESS) &&
+	    (!old ||
+	     old->compress_type != info->compress_type ||
+	     old->compress_level != info->compress_level ||
+	     (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) &&
+	      btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) {
+		const char *compress_type = btrfs_compress_type2str(info->compress_type);
+
+		btrfs_info(info, "%s %s compression, level %d",
+			   btrfs_test_opt(info, FORCE_COMPRESS) ?
"force" : "use", + compress_type, info->compress_level); + } + + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) + btrfs_info(info, "max_inline set to %llu", info->max_inline); +} - btrfs_clear_sb_rdonly(sb); +static int btrfs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_context old_ctx; + int ret = 0; + bool mount_reconfigure = (fc->s_fs_info != NULL); - set_bit(BTRFS_FS_OPEN, &fs_info->flags); + btrfs_info_to_ctx(fs_info, &old_ctx); - /* - * If we've gone from readonly -> read/write, we need to get - * our sync/async discard lists in the right state. - */ - btrfs_discard_resume(fs_info); + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; + + sync_filesystem(sb); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + return -EINVAL; + + ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); + if (ret < 0) + return ret; + + btrfs_ctx_to_info(fs_info, ctx); + btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags); + btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, + old_ctx.thread_pool_size); + + if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) { + btrfs_warn(fs_info, + "remount supports changing free space tree only from RO to RW"); + /* Make sure free space cache options match the state on disk. */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + } + if (btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); + } } -out: + + ret = 0; + if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_ro(fs_info); + else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_rw(fs_info); + if (ret) + goto restore; + /* - * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS, - * since the absence of the flag means it can be toggled off by remount. + * If we set the mask during the parameter parsing VFS would reject the + * remount. Here we can set the mask and the value will be updated + * appropriately. 
*/ - *flags |= SB_I_VERSION; + if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL)) + fc->sb_flags_mask |= SB_POSIXACL; + btrfs_emit_options(fs_info, &old_ctx); wake_up_process(fs_info->transaction_kthread); - btrfs_remount_cleanup(fs_info, old_opts); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); btrfs_clear_oneshot_options(fs_info); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); return 0; - restore: - /* We've hit an error - don't reset SB_RDONLY */ - if (sb_rdonly(sb)) - old_flags |= SB_RDONLY; - if (!(old_flags & SB_RDONLY)) - clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); - sb->s_flags = old_flags; - fs_info->mount_opt = old_opts; - fs_info->compress_type = old_compress_type; - fs_info->max_inline = old_max_inline; - btrfs_resize_thread_pool(fs_info, - old_thread_pool_size, fs_info->thread_pool_size); - fs_info->metadata_ratio = old_metadata_ratio; - btrfs_remount_cleanup(fs_info, old_opts); + btrfs_ctx_to_info(fs_info, &old_ctx); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - return ret; } @@ -2133,14 +1815,288 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= - BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32; - buf->f_fsid.val[1] ^= - BTRFS_I(d_inode(dentry))->root->root_key.objectid; + buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32; + buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root); + + return 0; +} + +static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc) +{ + struct btrfs_fs_info *p = fc->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + return fs_info->fs_devices == p->fs_devices; +} + +static int btrfs_get_tree_super(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = fc->s_fs_info; + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_devices *fs_devices = NULL; + struct block_device *bdev; + struct btrfs_device *device; + struct super_block *sb; + blk_mode_t mode = btrfs_open_mode(fc); + int ret; + + btrfs_ctx_to_info(fs_info, ctx); + mutex_lock(&uuid_mutex); + + /* + * With 'true' passed to btrfs_scan_one_device() (mount time) we expect + * either a valid device or an error. + */ + device = btrfs_scan_one_device(fc->source, mode, true); + ASSERT(device != NULL); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + return PTR_ERR(device); + } + + fs_devices = device->fs_devices; + fs_info->fs_devices = fs_devices; + + ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type); + mutex_unlock(&uuid_mutex); + if (ret) + return ret; + + if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + ret = -EACCES; + goto error; + } + + bdev = fs_devices->latest_dev->bdev; + + /* + * From now on the error handling is not straightforward. + * + * If successful, this will transfer the fs_info into the super block, + * and fc->s_fs_info will be NULL. However if there's an existing + * super, we'll still have fc->s_fs_info populated. If we error + * completely out it'll be cleaned up when we drop the fs_context, + * otherwise it's tied to the lifetime of the super_block. 
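+	 *
+	 * Summarizing the above as an aide (no new rules, just the three
+	 * outcomes spelled out):
+	 *
+	 *	sget_fc() result	fc->s_fs_info	fs_info freed by
+	 *	new superblock		NULL		btrfs_kill_super()
+	 *	existing superblock	still set	btrfs_free_fs_context()
+	 *	error			still set	btrfs_free_fs_context()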
+	 */
+	sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
+	if (IS_ERR(sb)) {
+		ret = PTR_ERR(sb);
+		goto error;
+	}
+
+	set_device_specific_options(fs_info);
+
+	if (sb->s_root) {
+		btrfs_close_devices(fs_devices);
+		/*
+		 * At this stage we may have RO flag mismatch between
+		 * fc->sb_flags and sb->s_flags. Caller should detect such
+		 * mismatch and reconfigure with sb->s_umount rwsem held if
+		 * needed.
+		 */
+	} else {
+		snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+		shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
+		btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
+		ret = btrfs_fill_super(sb, fs_devices);
+		if (ret) {
+			deactivate_locked_super(sb);
+			return ret;
+		}
+	}
+
+	btrfs_clear_oneshot_options(fs_info);
+
+	fc->root = dget(sb->s_root);
+	return 0;
+
+error:
+	btrfs_close_devices(fs_devices);
+	return ret;
+}
+
+/*
+ * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes
+ * with different ro/rw options") the following works:
+ *
+ *	(i)  mount /dev/sda3 -o subvol=foo,ro /mnt/foo
+ *	(ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar
+ *
+ * which looks nice and innocent but is actually pretty intricate and deserves
+ * a long comment.
+ *
+ * On another filesystem a subvolume mount is close to something like:
+ *
+ *	(iii) # create rw superblock + initial mount
+ *	      mount -t xfs /dev/sdb /opt/
+ *
+ *	      # create ro bind mount
+ *	      mount --bind -o ro /opt/foo /mnt/foo
+ *
+ *	      # unmount initial mount
+ *	      umount /opt
+ *
+ * Of course, there's some special subvolume sauce and there's the fact that the
+ * sb->s_root dentry is really swapped after mount_subtree(). But conceptually
+ * it's very close and will help us understand the issue.
+ *
+ * The old mount API didn't cleanly distinguish between a mount being made ro
+ * and a superblock being made ro. The only way to change the ro state of
+ * either object was by passing MS_RDONLY. If a new mount was created via
+ * mount(2) such as:
+ *
+ *      mount("/dev/sdb", "/mnt", "xfs", MS_RDONLY, NULL);
+ *
+ * the MS_RDONLY flag being specified had two effects:
+ *
+ * (1) MNT_READONLY was raised -> the resulting mount got
+ *     @mnt->mnt_flags |= MNT_READONLY raised.
+ *
+ * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems
+ *     made the superblock ro. Note how SB_RDONLY has the same value as
+ *     MS_RDONLY and is raised whenever MS_RDONLY is passed through mount(2).
+ *
+ * Creating a subtree mount via (iii) ends up leaving a rw superblock with a
+ * subtree mounted ro.
+ *
+ * But consider the effect of the old mount API on btrfs subvolume mounting,
+ * which combines the distinct steps in (iii) into a single step.
+ *
+ * By issuing (i) both the mount and the superblock are turned ro. Now when (ii)
+ * is issued the superblock is ro and thus even if the mount created for (ii) is
+ * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro
+ * to rw for (ii), which it did using an internal remount call.
+ *
+ * IOW, subvolume mounting was inherently complicated due to the ambiguity of
+ * MS_RDONLY in mount(2). Note that mount(8) always translates "ro" to
+ * MS_RDONLY, so in both (i) and (ii) "ro" becomes MS_RDONLY when passed by
+ * mount(8) to mount(2).
+ *
+ * Enter the new mount API. The new mount API disambiguates making a mount ro
+ * and making a superblock ro.
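+ *
+ * Concretely, the distinction looks like this under the new API (a sketch
+ * only, with a hypothetical device and paths; see (3) and (4) below):
+ *
+ *	fd = fsopen("xfs", 0);
+ *	fsconfig(fd, FSCONFIG_SET_STRING, "source", "/dev/sdb", 0);
+ *	fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);  <- superblock stays rw
+ *	mfd = fsmount(fd, 0, MOUNT_ATTR_RDONLY);           <- only the mount is ro
+ *	move_mount(mfd, "", AT_FDCWD, "/mnt/foo", MOVE_MOUNT_F_EMPTY_PATH);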
+ *
+ * (3) To turn a mount ro the MOUNT_ATTR_RDONLY flag can be used with either
+ *     fsmount() or mount_setattr(); this is a pure VFS level change for a
+ *     specific mount or mount tree that is never seen by the filesystem itself.
+ *
+ * (4) To turn a superblock ro the "ro" flag must be used with
+ *     fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
+ *     in fc->sb_flags.
+ *
+ * But, currently the util-linux mount command already utilizes the new mount
+ * API and is still setting fsconfig(FSCONFIG_SET_FLAG, "ro") no matter if it's
+ * btrfs or not, setting the whole superblock RO. To make per-subvolume mounting
+ * with different options work we need to keep backward compatibility.
+ */
+static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	if (fc->sb_flags & SB_RDONLY)
+		return ret;
+
+	down_write(&mnt->mnt_sb->s_umount);
+	if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY))
+		ret = btrfs_reconfigure(fc);
+	up_write(&mnt->mnt_sb->s_umount);
+	return ret;
+}
+
+static int btrfs_get_tree_subvol(struct fs_context *fc)
+{
+	struct btrfs_fs_info *fs_info = NULL;
+	struct btrfs_fs_context *ctx = fc->fs_private;
+	struct fs_context *dup_fc;
+	struct dentry *dentry;
+	struct vfsmount *mnt;
+	int ret = 0;
+
+	/*
+	 * Setup a dummy root and fs_info for test/set super. This is because
+	 * we don't actually fill this stuff out until open_ctree, but we need
+	 * it for searching for an existing superblock; open_ctree will then
+	 * properly initialize the file system specific settings later.
+	 * btrfs_init_fs_info initializes the static elements of the fs_info
+	 * (locks and such) to make cleanup easier if we find a superblock
+	 * with our given fs_devices later on at sget() time.
+	 */
+	fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+	if (!fs_info)
+		return -ENOMEM;
+
+	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+	if (!fs_info->super_copy || !fs_info->super_for_commit) {
+		btrfs_free_fs_info(fs_info);
+		return -ENOMEM;
+	}
+	btrfs_init_fs_info(fs_info);
+
+	dup_fc = vfs_dup_fs_context(fc);
+	if (IS_ERR(dup_fc)) {
+		btrfs_free_fs_info(fs_info);
+		return PTR_ERR(dup_fc);
+	}
+
+	/*
+	 * When we do the sget_fc this gets transferred to the sb, so we only
+	 * need to set it on the dup_fc as this is what creates the super block.
+	 */
+	dup_fc->s_fs_info = fs_info;
+
+	/*
+	 * We'll do the security settings in our btrfs_get_tree_super() mount
+	 * loop, they were duplicated into dup_fc, we can drop the originals
+	 * here.
+	 */
+	security_free_mnt_opts(&fc->security);
+	fc->security = NULL;
+	mnt = fc_mount(dup_fc);
+	if (IS_ERR(mnt)) {
+		put_fs_context(dup_fc);
+		return PTR_ERR(mnt);
+	}
+	ret = btrfs_reconfigure_for_mount(dup_fc, mnt);
+	put_fs_context(dup_fc);
+	if (ret) {
+		mntput(mnt);
+		return ret;
+	}
+
+	/*
+	 * This frees ->subvol_name, because if it isn't set we have to
+	 * allocate a buffer to hold the subvol_name, so we just drop our
+	 * reference to it here.
+	 */
+	dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt);
+	ctx->subvol_name = NULL;
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	fc->root = dentry;
 	return 0;
 }
 
+static int btrfs_get_tree(struct fs_context *fc)
+{
+	/*
+	 * Since we use mount_subtree to mount the default/specified subvol, we
+	 * have to do mounts in two steps.
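+	 *
+	 * In call-graph form (names from this file; the prose below walks
+	 * through the same steps):
+	 *
+	 *	btrfs_get_tree()
+	 *	  -> btrfs_get_tree_subvol()
+	 *	       -> fc_mount(dup_fc)
+	 *	            -> btrfs_get_tree()             (second pass)
+	 *	                 -> btrfs_get_tree_super()  (open_ctree() etc.)
+	 *	       -> mount_subvol()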
+ * + * First pass through we call btrfs_get_tree_subvol(), this is just a + * wrapper around fc_mount() to call back into here again, and this time + * we'll call btrfs_get_tree_super(). This will do the open_ctree() and + * everything to open the devices and file system. Then we return back + * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and + * from there we can do our mount_subvol() call, which will lookup + * whichever subvol we're mounting and setup this fc with the + * appropriate dentry for the subvol. + */ + if (fc->s_fs_info) + return btrfs_get_tree_super(fc); + return btrfs_get_tree_subvol(fc); +} + static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -2148,22 +2104,85 @@ static void btrfs_kill_super(struct super_block *sb) btrfs_free_fs_info(fs_info); } -static struct file_system_type btrfs_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, -}; +static void btrfs_free_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_info *fs_info = fc->s_fs_info; + + if (fs_info) + btrfs_free_fs_info(fs_info); + + if (ctx && refcount_dec_and_test(&ctx->refs)) { + kfree(ctx->subvol_name); + kfree(ctx); + } +} + +static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc) +{ + struct btrfs_fs_context *ctx = src_fc->fs_private; -static struct file_system_type btrfs_root_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount_root, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + /* + * Give a ref to our ctx to this dup, as we want to keep it around for + * our original fc so we can have the subvolume name or objectid. + * + * We unset ->source in the original fc because the dup needs it for + * mounting, and then once we free the dup it'll free ->source, so we + * need to make sure we're only pointing to it in one fc. 
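+	 *
+	 * A rough sketch of the state once this returns (refs == 2 from the
+	 * refcount_inc() below):
+	 *
+	 *	src_fc (original): fs_private -> ctx, source == NULL
+	 *	fc (the dup):      fs_private -> ctx, source == "/dev/..."
+	 *
+	 * so ctx->subvol_name stays reachable from the original fc for
+	 * mount_subvol() while only the dup owns ->source.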
+ */ + refcount_inc(&ctx->refs); + fc->fs_private = ctx; + fc->source = src_fc->source; + src_fc->source = NULL; + return 0; +} + +static const struct fs_context_operations btrfs_fs_context_ops = { + .parse_param = btrfs_parse_param, + .reconfigure = btrfs_reconfigure, + .get_tree = btrfs_get_tree, + .dup = btrfs_dup_fs_context, + .free = btrfs_free_fs_context, }; +static int btrfs_init_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + refcount_set(&ctx->refs, 1); + fc->fs_private = ctx; + fc->ops = &btrfs_fs_context_ops; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx); + } else { + ctx->thread_pool_size = + min_t(unsigned long, num_online_cpus() + 2, 8); + ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE; + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + fc->sb_flags |= SB_I_VERSION; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .init_fs_context = btrfs_init_fs_context, + .parameters = btrfs_fs_parameters, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + }; + MODULE_ALIAS_FS("btrfs"); static int btrfs_control_open(struct inode *inode, struct file *file) @@ -2194,12 +2213,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); - vol->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol); + if (ret < 0) + goto out; switch (cmd) { case BTRFS_IOC_SCAN_DEV: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2213,8 +2238,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); - if (IS_ERR(device)) { + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); ret = PTR_ERR(device); break; @@ -2228,15 +2257,14 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; } +out: kfree(vol); return ret; } static int btrfs_freeze(struct super_block *sb) { - struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; set_bit(BTRFS_FS_FROZEN, &fs_info->flags); /* @@ -2245,20 +2273,14 @@ static int btrfs_freeze(struct super_block *sb) * we want to avoid on a frozen filesystem), or do the commit * ourselves. 
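+ *
+ * Note: btrfs_commit_current_transaction() wraps exactly that pattern; it
+ * attaches to the running transaction if there is one, treats -ENOENT (no
+ * running transaction) as success and commits otherwise, which is why the
+ * open-coded attach/commit sequence could be dropped below.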
*/ - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - /* no transaction, don't bother */ - if (PTR_ERR(trans) == -ENOENT) - return 0; - return PTR_ERR(trans); - } - return btrfs_commit_transaction(trans); + return btrfs_commit_current_transaction(fs_info->tree_root); } static int check_dev_super(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_super_block *sb; + u64 last_trans; u16 csum_type; int ret = 0; @@ -2294,10 +2316,10 @@ static int check_dev_super(struct btrfs_device *dev) if (ret < 0) goto out; - if (btrfs_super_generation(sb) != fs_info->last_trans_committed) { + last_trans = btrfs_get_last_trans_committed(fs_info); + if (btrfs_super_generation(sb) != last_trans) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", - btrfs_super_generation(sb), - fs_info->last_trans_committed); + btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; goto out; } @@ -2355,6 +2377,33 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) return 0; } +static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_count(fs_info, nr); + + /* + * Only report the real number for DEBUG builds, as there are reports of + * serious performance degradation caused by too frequent shrinks. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + return nr; + return 0; +} + +static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) +{ + const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_free_extent_maps(fs_info, nr_to_scan); + + /* The extent map shrinker runs asynchronously, so always return 0. 
*/ + return 0; +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -2366,9 +2415,10 @@ static const struct super_operations btrfs_super_ops = { .destroy_inode = btrfs_destroy_inode, .free_inode = btrfs_free_inode, .statfs = btrfs_statfs, - .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, .unfreeze_fs = btrfs_unfreeze, + .nr_cached_objects = btrfs_nr_cached_objects, + .free_cached_objects = btrfs_free_cached_objects, }; static const struct file_operations btrfs_ctl_fops = { @@ -2407,9 +2457,6 @@ static int __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - ", integrity-checker=on" -#endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY ", ref-verify=on" #endif @@ -2459,6 +2506,9 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_init_cachep, .exit_func = btrfs_destroy_cachep, }, { + .init_func = btrfs_init_dio, + .exit_func = btrfs_destroy_dio, + }, { .init_func = btrfs_transaction_init, .exit_func = btrfs_transaction_exit, }, { @@ -2550,6 +2600,7 @@ static int __init init_btrfs_fs(void) late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) +MODULE_DESCRIPTION("B-Tree File System (BTRFS)"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); MODULE_SOFTDEP("pre: xxhash64"); diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index 8dbb909b364f..d80a86acfbbe 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,11 +3,20 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); +#include <linux/types.h> +#include <linux/fs.h> +#include "fs.h" + +struct super_block; +struct btrfs_fs_info; + +bool btrfs_check_options(const struct btrfs_fs_info *info, + unsigned long long *mount_opt, + unsigned long flags); int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); +void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info); static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c9198723e4cb..5912d5057766 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); +BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif #ifdef CONFIG_BTRFS_DEBUG /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); +/* Remove once support for raid stripe tree is feature complete. 
*/ +BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), BTRFS_FEAT_ATTR_PTR(block_group_tree), + BTRFS_FEAT_ATTR_PTR(simple_quota), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(extent_tree_v2), + BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), @@ -380,6 +385,8 @@ static const char *rescue_opts[] = { "nologreplay", "ignorebadroots", "ignoredatacsums", + "ignoremetacsums", + "ignoresuperflags", "all", }; @@ -416,10 +423,17 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { - return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); + return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); } BTRFS_ATTR(static_feature, acl, acl_show); +static ssize_t temp_fsid_supported_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + return sysfs_emit(buf, "0\n"); +} +BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show); + /* * Features which only depend on kernel version. * @@ -433,6 +447,7 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), + BTRFS_ATTR_PTR(static_feature, temp_fsid), NULL }; @@ -881,6 +896,9 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +SPACE_INFO_ATTR(reclaim_count); +SPACE_INFO_ATTR(reclaim_bytes); +SPACE_INFO_ATTR(reclaim_errors); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); @@ -889,8 +907,12 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); + ssize_t ret; - return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); + spin_lock(&space_info->lock); + ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info)); + spin_unlock(&space_info->lock); + return ret; } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, @@ -901,6 +923,9 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, int thresh; int ret; + if (READ_ONCE(space_info->dynamic_reclaim)) + return -EINVAL; + ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; @@ -917,6 +942,72 @@ BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, btrfs_sinfo_bg_reclaim_threshold_show, btrfs_sinfo_bg_reclaim_threshold_store); +static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim)); +} + +static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int dynamic_reclaim; + int ret; + + ret = kstrtoint(buf, 10, &dynamic_reclaim); + if (ret) + return ret; + + if (dynamic_reclaim < 0) + return 
-EINVAL; + + WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0); + + return len; +} + +BTRFS_ATTR_RW(space_info, dynamic_reclaim, + btrfs_sinfo_dynamic_reclaim_show, + btrfs_sinfo_dynamic_reclaim_store); + +static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim)); +} + +static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int periodic_reclaim; + int ret; + + ret = kstrtoint(buf, 10, &periodic_reclaim); + if (ret) + return ret; + + if (periodic_reclaim < 0) + return -EINVAL; + + WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0); + + return len; +} + +BTRFS_ATTR_RW(space_info, periodic_reclaim, + btrfs_sinfo_periodic_reclaim_show, + btrfs_sinfo_periodic_reclaim_store); + /* * Allocation information about block group types. * @@ -934,8 +1025,13 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, dynamic_reclaim), BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), + BTRFS_ATTR_PTR(space_info, reclaim_count), + BTRFS_ATTR_PTR(space_info, reclaim_bytes), + BTRFS_ATTR_PTR(space_info, reclaim_errors), + BTRFS_ATTR_PTR(space_info, periodic_reclaim), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif @@ -1022,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1032,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1084,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -1196,21 +1292,31 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%llu\n", fs_info->generation); + return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info)); } BTRFS_ATTR(, generation, btrfs_generation_show); +static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid); +} +BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); + static const char * const btrfs_read_policy_name[] = { "pid" }; static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < 
BTRFS_NR_READ_POLICY; i++) { - if (fs_devices->read_policy == i) + if (policy == i) ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); @@ -1234,8 +1340,8 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != fs_devices->read_policy) { - fs_devices->read_policy = i; + if (i != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, i); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", btrfs_read_policy_name[i]); @@ -1284,6 +1390,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); +#ifdef CONFIG_BTRFS_DEBUG +static ssize_t btrfs_offload_csum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + + switch (READ_ONCE(fs_devices->offload_csum_mode)) { + case BTRFS_OFFLOAD_CSUM_AUTO: + return sysfs_emit(buf, "auto\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_ON: + return sysfs_emit(buf, "1\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_OFF: + return sysfs_emit(buf, "0\n"); + default: + WARN_ON(1); + return -EINVAL; + } +} + +static ssize_t btrfs_offload_csum_store(struct kobject *kobj, + struct kobj_attribute *a, const char *buf, + size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret == 0) + WRITE_ONCE(fs_devices->offload_csum_mode, + val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); + else if (ret == -EINVAL && sysfs_streq(buf, "auto")) + WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); + else + return -EINVAL; + + return len; +} +BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); +#endif + /* * Per-filesystem information and stats. 
* @@ -1302,6 +1449,10 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), + BTRFS_ATTR_PTR(, temp_fsid), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(, offload_csum), +#endif NULL, }; @@ -2090,6 +2241,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, } BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); +static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + ssize_t ret = 0; + + spin_lock(&fs_info->qgroup_lock); + ASSERT(btrfs_qgroup_enabled(fs_info)); + switch (btrfs_qgroup_mode(fs_info)) { + case BTRFS_QGROUP_MODE_FULL: + ret = sysfs_emit(buf, "qgroup\n"); + break; + case BTRFS_QGROUP_MODE_SIMPLE: + ret = sysfs_emit(buf, "squota\n"); + break; + default: + btrfs_warn(fs_info, "unexpected qgroup mode %d\n", + btrfs_qgroup_mode(fs_info)); + break; + } + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} +BTRFS_ATTR(qgroups, mode, qgroup_mode_show); + static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) @@ -2152,6 +2330,7 @@ static struct attribute *qgroups_attrs[] = { BTRFS_ATTR_PTR(qgroups, enabled), BTRFS_ATTR_PTR(qgroups, inconsistent), BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), + BTRFS_ATTR_PTR(qgroups, mode), NULL }; ATTRIBUTE_GROUPS(qgroups); @@ -2243,7 +2422,7 @@ int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, struct kobject *qgroups_kobj = fs_info->qgroups_kobj; int ret; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return 0; if (qgroup->kobj.state_initialized) return 0; @@ -2264,7 +2443,7 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return; rbtree_postorder_for_each_entry_safe(qgroup, next, @@ -2285,7 +2464,7 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) struct btrfs_qgroup *next; int ret = 0; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return 0; ASSERT(fsid_kobj); @@ -2317,7 +2496,7 @@ out: void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return; if (qgroup->kobj.state_initialized) { diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 86c7eef12873..e6a284c59809 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,8 +3,17 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H +#include <linux/types.h> +#include <linux/compiler_types.h> #include <linux/kobject.h> +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_block_group; +struct btrfs_space_info; +struct btrfs_qgroup; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ca09cf9afce8..ce50847e1e01 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -28,6 +28,7 @@ const char *test_error[] = { [TEST_ALLOC_INODE] = "cannot allocate inode", [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", + [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", }; static const struct 
super_operations btrfs_test_super_ops = { @@ -60,10 +61,7 @@ struct inode *btrfs_new_test_inode(void) return NULL; inode->i_mode = S_IFREG; - inode->i_ino = BTRFS_FIRST_FREE_OBJECTID; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; - BTRFS_I(inode)->location.offset = 0; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_FIRST_FREE_OBJECTID); inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG); return inode; @@ -102,7 +100,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0); + extent_io_tree_init(fs_info, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); @@ -159,8 +157,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) if (!fs_info) return; - if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, - &fs_info->fs_state))) + if (WARN_ON(!btrfs_is_testing(fs_info))) return; test_mnt->mnt_sb->s_fs_info = NULL; @@ -185,7 +182,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->buffer_lock); - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, dev_list) { btrfs_free_dummy_device(dev); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 7a2d7ffbe30e..dc2f2ab15fa5 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -23,6 +23,7 @@ enum { TEST_ALLOC_INODE, TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, + TEST_ALLOC_CHUNK_MAP, }; extern const char *test_error[]; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index f4fd3fb7c887..de226209220f 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -14,9 +14,9 @@ #include "../disk-io.h" #include "../btrfs_inode.h" -#define PROCESS_UNLOCK (1 << 0) -#define PROCESS_RELEASE (1 << 1) -#define PROCESS_TEST_LOCKED (1 << 2) +#define PROCESS_UNLOCK (1U << 0) +#define PROCESS_RELEASE (1U << 1) +#define PROCESS_TEST_LOCKED (1U << 2) static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) @@ -180,7 +180,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -211,7 +211,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) } start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -266,7 +266,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, 
max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); @@ -307,7 +307,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); @@ -672,7 +672,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, const char *test_name) { for (int i = 0; i < eb->len; i++) { - struct page *page = eb->pages[i >> PAGE_SHIFT]; + struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0); void *addr = page_address(page) + offset_in_page(i); if (memcmp(addr, memory + i, 1) != 0) { @@ -688,7 +688,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, const char *test_name) { for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { - void *eb_addr = page_address(eb->pages[i]); + void *eb_addr = folio_address(eb->folios[i]); if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { dump_eb_and_memory_contents(eb, memory, test_name); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index bf85c75ee722..609bb6c9c087 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -11,23 +11,27 @@ #include "../disk-io.h" #include "../block-group.h" -static void free_extent_map_tree(struct extent_map_tree *em_tree) +static int free_extent_map_tree(struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; struct rb_node *node; + int ret = 0; write_lock(&em_tree->lock); - while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { - node = rb_first_cached(&em_tree->map); + while (!RB_EMPTY_ROOT(&em_tree->root)) { + node = rb_first(&em_tree->root); em = rb_entry(node, struct extent_map, rb_node); - remove_extent_mapping(em_tree, em); + remove_extent_mapping(inode, em); #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { + ret = -EINVAL; test_err( -"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d", - em->start, em->len, em->block_start, - em->block_len, refcount_read(&em->refs)); +"em leak: em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu offset %llu) refs %d", + em->start, em->len, em->disk_bytenr, + em->disk_num_bytes, em->offset, + refcount_read(&em->refs)); refcount_set(&em->refs, 1); } @@ -35,6 +39,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) free_extent_map(em); } write_unlock(&em_tree->lock); + + return ret; } /* @@ -53,13 +59,14 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) * ->add_extent_mapping(0, 16K) * -> #handle -EEXIST */ -static int test_case_1(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 start = 0; u64 len = SZ_8K; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -70,10 +77,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info, /* Add [0, 16K) */ em->start = 0; em->len = SZ_16K; - em->block_start = 0; - em->block_len = 
SZ_16K; + em->disk_bytenr = 0; + em->disk_num_bytes = SZ_16K; + em->ram_bytes = SZ_16K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); @@ -91,10 +99,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->start = SZ_16K; em->len = SZ_4K; - em->block_start = SZ_32K; /* avoid merging */ - em->block_len = SZ_4K; + em->disk_bytenr = SZ_32K; /* avoid merging */ + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); @@ -112,27 +121,35 @@ static int test_case_1(struct btrfs_fs_info *fs_info, /* Add [0, 8K), should return [0, 16K) instead. */ em->start = start; em->len = len; - em->block_start = start; - em->block_len = len; + em->disk_bytenr = start; + em->disk_num_bytes = len; + em->ram_bytes = len; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret) { test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); goto out; } - if (em && - (em->start != 0 || extent_map_end(em) != SZ_16K || - em->block_start != 0 || em->block_len != SZ_16K)) { + if (!em) { + test_err("case1 [%llu %llu]: no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } + if (em->start != 0 || extent_map_end(em) != SZ_16K || + em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) { test_err( -"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", +"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu", start, start + len, ret, em->start, em->len, - em->block_start, em->block_len); + em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } @@ -143,11 +160,12 @@ out: * Reading the inline ending up with EEXIST, ie. read an inline * extent and discard page cache and read it again. 
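+ *
+ * In sequence form, the steps the test below replays:
+ *
+ *	read inline extent   -> em [0, 1K) inserted
+ *	drop page cache
+ *	read it again        -> the second insertion of [0, 1K) must return
+ *	                        the existing em instead of failing (-EEXIST)
 */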
*/ -static int test_case_2(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -158,10 +176,11 @@ static int test_case_2(struct btrfs_fs_info *fs_info, /* Add [0, 1K) */ em->start = 0; em->len = SZ_1K; - em->block_start = EXTENT_MAP_INLINE; - em->block_len = (u64)-1; + em->disk_bytenr = EXTENT_MAP_INLINE; + em->disk_num_bytes = 0; + em->ram_bytes = SZ_1K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); @@ -179,10 +198,11 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->start = SZ_4K; em->len = SZ_4K; - em->block_start = SZ_4K; - em->block_len = SZ_4K; + em->disk_bytenr = SZ_4K; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -200,37 +220,45 @@ static int test_case_2(struct btrfs_fs_info *fs_info, /* Add [0, 1K) */ em->start = 0; em->len = SZ_1K; - em->block_start = EXTENT_MAP_INLINE; - em->block_len = (u64)-1; + em->disk_bytenr = EXTENT_MAP_INLINE; + em->disk_num_bytes = 0; + em->ram_bytes = SZ_1K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret) { test_err("case2 [0 1K]: ret %d", ret); goto out; } - if (em && - (em->start != 0 || extent_map_end(em) != SZ_1K || - em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) { + if (!em) { + test_err("case2 [0 1K]: no extent map returned"); + ret = -ENOENT; + goto out; + } + if (em->start != 0 || extent_map_end(em) != SZ_1K || + em->disk_bytenr != EXTENT_MAP_INLINE) { test_err( -"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", - ret, em->start, em->len, em->block_start, - em->block_len); +"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", + ret, em->start, em->len, em->disk_bytenr); ret = -EINVAL; } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } static int __test_case_3(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, u64 start) + struct btrfs_inode *inode, u64 start) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 len = SZ_4K; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -241,10 +269,11 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, /* Add [4K, 8K) */ em->start = SZ_4K; em->len = SZ_4K; - em->block_start = SZ_4K; - em->block_len = SZ_4K; + em->disk_bytenr = SZ_4K; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -262,32 +291,40 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, /* Add [0, 16K) */ em->start = 0; em->len = SZ_16K; - 
em->block_start = 0;
-	em->block_len = SZ_16K;
+	em->disk_bytenr = 0;
+	em->disk_num_bytes = SZ_16K;
+	em->ram_bytes = SZ_16K;
 	write_lock(&em_tree->lock);
-	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+	ret = btrfs_add_extent_mapping(inode, &em, start, len);
 	write_unlock(&em_tree->lock);
 	if (ret) {
-		test_err("case3 [0x%llx 0x%llx): ret %d",
+		test_err("case3 [%llu %llu): ret %d",
 			 start, start + len, ret);
 		goto out;
 	}
+	if (!em) {
+		test_err("case3 [%llu %llu): no extent map returned",
+			 start, start + len);
+		ret = -ENOENT;
+		goto out;
+	}
 	/*
-	 * Since bytes within em are contiguous, em->block_start is identical to
-	 * em->start.
+	 * Since bytes within em are contiguous, extent_map_block_start(em) is
+	 * identical to em->start.
 	 */
-	if (em &&
-	    (start < em->start || start + len > extent_map_end(em) ||
-	     em->start != em->block_start || em->len != em->block_len)) {
+	if (start < em->start || start + len > extent_map_end(em) ||
+	    em->start != extent_map_block_start(em)) {
 		test_err(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)",
 			 start, start + len, ret, em->start, em->len,
-			 em->block_start, em->block_len);
+			 em->disk_bytenr, em->disk_num_bytes);
 		ret = -EINVAL;
 	}
 	free_extent_map(em);
 out:
-	free_extent_map_tree(em_tree);
+	ret2 = free_extent_map_tree(inode);
+	if (ret == 0)
+		ret = ret2;
 	return ret;
 }
@@ -308,28 +345,29 @@ out:
  *   -> add_extent_mapping()
  *   -> add_extent_mapping()
  */
-static int test_case_3(struct btrfs_fs_info *fs_info,
-		       struct extent_map_tree *em_tree)
+static int test_case_3(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
 {
 	int ret;
 
-	ret = __test_case_3(fs_info, em_tree, 0);
+	ret = __test_case_3(fs_info, inode, 0);
 	if (ret)
 		return ret;
-	ret = __test_case_3(fs_info, em_tree, SZ_8K);
+	ret = __test_case_3(fs_info, inode, SZ_8K);
 	if (ret)
 		return ret;
-	ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K));
+	ret = __test_case_3(fs_info, inode, (12 * SZ_1K));
 
 	return ret;
 }
 
 static int __test_case_4(struct btrfs_fs_info *fs_info,
-			 struct extent_map_tree *em_tree, u64 start)
+			 struct btrfs_inode *inode, u64 start)
 {
+	struct extent_map_tree *em_tree = &inode->extent_tree;
 	struct extent_map *em;
 	u64 len = SZ_4K;
 	int ret;
+	int ret2;
 
 	em = alloc_extent_map();
 	if (!em) {
@@ -340,10 +378,11 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
 	/* Add [0K, 8K) */
 	em->start = 0;
 	em->len = SZ_8K;
-	em->block_start = 0;
-	em->block_len = SZ_8K;
+	em->disk_bytenr = 0;
+	em->disk_num_bytes = SZ_8K;
+	em->ram_bytes = SZ_8K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [0, 8K)");
@@ -361,10 +400,11 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
 	/* Add [8K, 32K) */
 	em->start = SZ_8K;
 	em->len = 24 * SZ_1K;
-	em->block_start = SZ_16K; /* avoid merging */
-	em->block_len = 24 * SZ_1K;
+	em->disk_bytenr = SZ_16K; /* avoid merging */
+	em->disk_num_bytes = 24 * SZ_1K;
+	em->ram_bytes = 24 * SZ_1K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [8K, 32K)");
@@ -381,26 +421,35 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
 	/* Add [0K, 32K) */
 	em->start = 0;
 	em->len = SZ_32K;
-	em->block_start = 0;
-	em->block_len = SZ_32K;
+	em->disk_bytenr = 0;
+	em->disk_num_bytes =
SZ_32K; + em->ram_bytes = SZ_32K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); if (ret) { - test_err("case4 [0x%llx 0x%llx): ret %d", - start, len, ret); + test_err("case4 [%llu %llu): ret %d", + start, start + len, ret); + goto out; + } + if (!em) { + test_err("case4 [%llu %llu): no extent map returned", + start, start + len); + ret = -ENOENT; goto out; } - if (em && (start < em->start || start + len > extent_map_end(em))) { + if (start < em->start || start + len > extent_map_end(em)) { test_err( -"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", - start, len, ret, em->start, em->len, em->block_start, - em->block_len); +"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)", + start, start + len, ret, em->start, em->len, + em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } @@ -430,22 +479,22 @@ out: * # handle -EEXIST when adding * # [0, 32K) */ -static int test_case_4(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_4(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { int ret; - ret = __test_case_4(fs_info, em_tree, 0); + ret = __test_case_4(fs_info, inode, 0); if (ret) return ret; - ret = __test_case_4(fs_info, em_tree, SZ_4K); + ret = __test_case_4(fs_info, inode, SZ_4K); return ret; } -static int add_compressed_extent(struct extent_map_tree *em_tree, +static int add_compressed_extent(struct btrfs_inode *inode, u64 start, u64 len, u64 block_start) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; @@ -457,11 +506,12 @@ static int add_compressed_extent(struct extent_map_tree *em_tree, em->start = start; em->len = len; - em->block_start = block_start; - em->block_len = SZ_4K; - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->disk_bytenr = block_start; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = len; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); free_extent_map(em); if (ret < 0) { @@ -513,7 +563,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index) struct rb_node *n; int i; - for (i = 0, n = rb_first_cached(&em_tree->map); + for (i = 0, n = rb_first(&em_tree->root); valid_ranges[index][i].len && n; i++, n = rb_next(n)) { struct extent_map *entry = rb_entry(n, struct extent_map, rb_node); @@ -567,53 +617,44 @@ static int validate_range(struct extent_map_tree *em_tree, int index) * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from * merging the em's. 
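+ *
+ * Layout built by the test below (all compressed ems):
+ *
+ *	[0, 12K) [12K, 24K) [24K, 36K) [36K, 40K) [40K, 64K)
+ *
+ * followed by dropping [8K, 12K), [12K, 20K), [28K, 32K) and [32K, 64K).
 */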
-static int test_case_5(void)
+static int test_case_5(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
 {
-	struct extent_map_tree *em_tree;
-	struct inode *inode;
 	u64 start, end;
 	int ret;
+	int ret2;
 
 	test_msg("Running btrfs_drop_extent_map_range tests");
 
-	inode = btrfs_new_test_inode();
-	if (!inode) {
-		test_std_err(TEST_ALLOC_INODE);
-		return -ENOMEM;
-	}
-
-	em_tree = &BTRFS_I(inode)->extent_tree;
-
 	/* [0, 12k) */
-	ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0);
+	ret = add_compressed_extent(inode, 0, SZ_4K * 3, 0);
 	if (ret) {
 		test_err("cannot add extent range [0, 12K)");
 		goto out;
 	}
 
 	/* [12k, 24k) */
-	ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+	ret = add_compressed_extent(inode, SZ_4K * 3, SZ_4K * 3, SZ_4K);
 	if (ret) {
 		test_err("cannot add extent range [12K, 24K)");
 		goto out;
 	}
 
 	/* [24k, 36k) */
-	ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+	ret = add_compressed_extent(inode, SZ_4K * 6, SZ_4K * 3, SZ_8K);
 	if (ret) {
 		test_err("cannot add extent range [24K, 36K)");
 		goto out;
 	}
 
 	/* [36k, 40k) */
-	ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+	ret = add_compressed_extent(inode, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
 	if (ret) {
 		test_err("cannot add extent range [36K, 40K)");
 		goto out;
 	}
 
 	/* [40k, 64k) */
-	ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+	ret = add_compressed_extent(inode, SZ_4K * 10, SZ_4K * 6, SZ_16K);
 	if (ret) {
 		test_err("cannot add extent range [40K, 64K)");
 		goto out;
@@ -622,36 +663,39 @@ static int test_case_5(void)
 	/* Drop [8k, 12k) */
 	start = SZ_8K;
 	end = (3 * SZ_4K) - 1;
-	btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
-	ret = validate_range(&BTRFS_I(inode)->extent_tree, 0);
+	btrfs_drop_extent_map_range(inode, start, end, false);
+	ret = validate_range(&inode->extent_tree, 0);
 	if (ret)
 		goto out;
 
 	/* Drop [12k, 20k) */
 	start = SZ_4K * 3;
 	end = SZ_16K + SZ_4K - 1;
-	btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
-	ret = validate_range(&BTRFS_I(inode)->extent_tree, 1);
+	btrfs_drop_extent_map_range(inode, start, end, false);
+	ret = validate_range(&inode->extent_tree, 1);
 	if (ret)
 		goto out;
 
 	/* Drop [28k, 32k) */
 	start = SZ_32K - SZ_4K;
 	end = SZ_32K - 1;
-	btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
-	ret = validate_range(&BTRFS_I(inode)->extent_tree, 2);
+	btrfs_drop_extent_map_range(inode, start, end, false);
+	ret = validate_range(&inode->extent_tree, 2);
 	if (ret)
 		goto out;
 
 	/* Drop [32k, 64k) */
 	start = SZ_32K;
 	end = SZ_64K - 1;
-	btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
-	ret = validate_range(&BTRFS_I(inode)->extent_tree, 3);
+	btrfs_drop_extent_map_range(inode, start, end, false);
+	ret = validate_range(&inode->extent_tree, 3);
 	if (ret)
 		goto out;
 out:
-	iput(inode);
+	ret2 = free_extent_map_tree(inode);
+	if (ret == 0)
+		ret = ret2;
+
 	return ret;
 }
 
@@ -660,31 +704,35 @@ out:
 * for areas between two existing ems. Validate it doesn't do this when there
 * are two unmerged em's side by side.
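+ *
+ * Layout used below: two compressed ems [0, 4K) and [4K, 8K) that were not
+ * merged, then btrfs_add_extent_mapping() is called for [0, 8K) and must
+ * return the existing [4K, 8K) em rather than inventing a gap-filling one.
 */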
*/ -static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) +static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em = NULL; int ret; + int ret2; - ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + ret = add_compressed_extent(inode, 0, SZ_4K, 0); if (ret) goto out; - ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + ret = add_compressed_extent(inode, SZ_4K, SZ_4K, 0); if (ret) goto out; em = alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); - return -ENOMEM; + ret = -ENOMEM; + goto out; } em->start = SZ_4K; em->len = SZ_4K; - em->block_start = SZ_16K; - em->block_len = SZ_16K; + em->disk_bytenr = SZ_16K; + em->disk_num_bytes = SZ_16K; + em->ram_bytes = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K); + ret = btrfs_add_extent_mapping(inode, &em, 0, SZ_8K); write_unlock(&em_tree->lock); if (ret != 0) { @@ -704,7 +752,10 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em ret = 0; out: free_extent_map(em); - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + return ret; } @@ -713,38 +764,30 @@ out: * true would mess up the start/end calculations and subsequent splits would be * incorrect. */ -static int test_case_7(void) +static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { - struct extent_map_tree *em_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; - struct inode *inode; int ret; + int ret2; test_msg("Running btrfs_drop_extent_cache with pinned"); - inode = btrfs_new_test_inode(); - if (!inode) { - test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; - } - - em_tree = &BTRFS_I(inode)->extent_tree; - em = alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); - ret = -ENOMEM; - goto out; + return -ENOMEM; } /* [0, 16K), pinned */ em->start = 0; em->len = SZ_16K; - em->block_start = 0; - em->block_len = SZ_4K; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->disk_bytenr = 0; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_16K; + em->flags |= (EXTENT_FLAG_PINNED | EXTENT_FLAG_COMPRESS_ZLIB); write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -762,10 +805,11 @@ static int test_case_7(void) /* [32K, 48K), not pinned */ em->start = SZ_32K; em->len = SZ_16K; - em->block_start = SZ_32K; - em->block_len = SZ_16K; + em->disk_bytenr = SZ_32K; + em->disk_num_bytes = SZ_16K; + em->ram_bytes = SZ_16K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -777,7 +821,7 @@ static int test_case_7(void) * Drop [0, 36K) This should skip the [0, 4K) extent and then split the * [32K, 48K) extent. */ - btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true); + btrfs_drop_extent_map_range(inode, 0, (36 * SZ_1K) - 1, true); /* Make sure our extent maps look sane. 
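The expectations checked next follow from trimming the front of the unpinned [32K, 48K) map at 36K while the pinned [0, 16K) map is skipped whole. A standalone check of that arithmetic, with the test's constants hard-coded for illustration:

    #include <stdio.h>

    #define SZ_1K 1024ULL

    int main(void)
    {
        /* The unpinned map from above: [32K, 48K), disk bytenr 32K. */
        unsigned long long em_start = 32 * SZ_1K, em_len = 16 * SZ_1K;
        unsigned long long disk_bytenr = 32 * SZ_1K;
        /* Dropping [0, 36K) trims everything before 36K. */
        unsigned long long drop_end = 36 * SZ_1K;   /* exclusive */

        unsigned long long new_start = drop_end;                    /* 36K */
        unsigned long long new_len = em_start + em_len - drop_end;  /* 12K */
        /* Block start advances by the same 4K that was trimmed. */
        unsigned long long new_block = disk_bytenr + (new_start - em_start);

        printf("start=%lluK len=%lluK block=%lluK\n",
               new_start / SZ_1K, new_len / SZ_1K, new_block / SZ_1K);
        return 0;
    }

This prints start=36K len=12K block=36K, matching the extent_map_block_start() check below (SZ_32K + SZ_4K).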
*/ ret = -EINVAL; @@ -826,8 +870,9 @@ static int test_case_7(void) goto out; } - if (em->block_start != SZ_32K + SZ_4K) { - test_err("em->block_start is %llu, expected 36K", em->block_start); + if (extent_map_block_start(em) != SZ_32K + SZ_4K) { + test_err("em->block_start is %llu, expected 36K", + extent_map_block_start(em)); goto out; } @@ -844,7 +889,110 @@ static int test_case_7(void) ret = 0; out: free_extent_map(em); - iput(inode); + /* Unpin our extent to prevent warning when removing it below. */ + ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0); + if (ret == 0) + ret = ret2; + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + + return ret; +} + +/* + * Test a regression for compressed extent map adjustment when we attempt to + * add an extent map that is partially overlapped by another existing extent + * map. The resulting extent map offset was left unchanged despite having + * incremented its start offset. + */ +static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + int ret; + int ret2; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + /* Compressed extent for the file range [120K, 128K). */ + em->start = SZ_1K * 120; + em->len = SZ_8K; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_8K; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("couldn't add extent map for range [120K, 128K)"); + goto out; + } + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* + * Compressed extent for the file range [108K, 144K), which overlaps + * with the [120K, 128K) we previously inserted. + */ + em->start = SZ_1K * 108; + em->len = SZ_1K * 36; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_1K * 36; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + + /* + * Try to add the extent map but with a search range of [140K, 144K), + * this should succeed and adjust the extent map to the range + * [128K, 144K), with a length of 16K and an offset of 20K. + * + * This simulates a scenario where in the subvolume tree of an inode we + * have a compressed file extent item for the range [108K, 144K) and we + * have an overlapping compressed extent map for the range [120K, 128K), + * which was created by an encoded write, but its ordered extent was not + * yet completed, so the subvolume tree doesn't have yet the file extent + * item for that range - we only have the extent map in the inode's + * extent map tree. 
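The three assertions that follow are pure arithmetic on the two ranges described in the comment above; a quick standalone verification (illustrative only):

    #include <stdio.h>

    #define SZ_1K 1024ULL

    int main(void)
    {
        /* Existing extent map in the tree: [120K, 128K). */
        unsigned long long existing_end = 128 * SZ_1K;
        /* Map being added, from the file extent item: [108K, 144K). */
        unsigned long long em_start = 108 * SZ_1K, em_len = 36 * SZ_1K;

        /* The new map must start where the existing one ends... */
        unsigned long long new_start = existing_end;                 /* 128K */
        unsigned long long new_len = em_start + em_len - new_start;  /* 16K */
        /* ...and its offset must advance by as much as its start did. */
        unsigned long long new_offset = new_start - em_start;        /* 20K */

        printf("start=%lluK len=%lluK offset=%lluK\n", new_start / SZ_1K,
               new_len / SZ_1K, new_offset / SZ_1K);
        return 0;
    }

The regression being tested was exactly the last step: em->offset was left at 0 even though em->start had been bumped from 108K to 128K.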
+ */ + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("couldn't add extent map for range [108K, 144K)"); + goto out; + } + + if (em->start != SZ_128K) { + test_err("unexpected extent map start %llu (should be 128K)", em->start); + ret = -EINVAL; + goto out; + } + if (em->len != SZ_16K) { + test_err("unexpected extent map length %llu (should be 16K)", em->len); + ret = -EINVAL; + goto out; + } + if (em->offset != SZ_1K * 20) { + test_err("unexpected extent map offset %llu (should be 20K)", em->offset); + ret = -EINVAL; + goto out; + } +out: + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + return ret; } @@ -864,33 +1012,21 @@ struct rmap_test_vector { static int test_rmap_block(struct btrfs_fs_info *fs_info, struct rmap_test_vector *test) { - struct extent_map *em; - struct map_lookup *map = NULL; + struct btrfs_chunk_map *map; u64 *logical = NULL; int i, out_ndaddrs, out_stripe_len; int ret; - em = alloc_extent_map(); - if (!em) { - test_std_err(TEST_ALLOC_EXTENT_MAP); - return -ENOMEM; - } - - map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL); + map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL); if (!map) { - kfree(em); - test_std_err(TEST_ALLOC_EXTENT_MAP); + test_std_err(TEST_ALLOC_CHUNK_MAP); return -ENOMEM; } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); /* Start at 4GiB logical address */ - em->start = SZ_4G; - em->len = test->data_stripe_size * test->num_data_stripes; - em->block_len = em->len; - em->orig_block_len = test->data_stripe_size; - em->map_lookup = map; - + map->start = SZ_4G; + map->chunk_len = test->data_stripe_size * test->num_data_stripes; + map->stripe_size = test->data_stripe_size; map->num_stripes = test->num_stripes; map->type = test->raid_type; @@ -906,15 +1042,14 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, map->stripes[i].physical = test->data_stripe_phys_start[i]; } - write_lock(&fs_info->mapping_tree.lock); - ret = add_extent_mapping(&fs_info->mapping_tree, em, 0); - write_unlock(&fs_info->mapping_tree.lock); + ret = btrfs_add_chunk_map(fs_info, map); if (ret) { - test_err("error adding block group mapping to mapping tree"); + test_err("error adding chunk map to mapping tree"); + btrfs_free_chunk_map(map); goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", @@ -943,14 +1078,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = 0; out: - write_lock(&fs_info->mapping_tree.lock); - remove_extent_mapping(&fs_info->mapping_tree, em); - write_unlock(&fs_info->mapping_tree.lock); - /* For us */ - free_extent_map(em); + btrfs_remove_chunk_map(fs_info, map); out_free: - /* For the tree */ - free_extent_map(em); kfree(logical); return ret; } @@ -958,7 +1087,8 @@ out_free: int btrfs_test_extent_map(void) { struct btrfs_fs_info *fs_info = NULL; - struct extent_map_tree *em_tree; + struct inode *inode; + struct btrfs_root *root = NULL; int ret = 0, i; struct rmap_test_vector rmap_tests[] = { { @@ -1007,33 +1137,45 @@ int btrfs_test_extent_map(void) return -ENOMEM; } - em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); - if (!em_tree) { + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); ret = 
-ENOMEM; goto out; } - extent_map_tree_init(em_tree); + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + root = NULL; + goto out; + } + + BTRFS_I(inode)->root = root; - ret = test_case_1(fs_info, em_tree); + ret = test_case_1(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_2(fs_info, em_tree); + ret = test_case_2(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_3(fs_info, em_tree); + ret = test_case_3(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_4(fs_info, em_tree); + ret = test_case_4(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_5(); + ret = test_case_5(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_6(fs_info, em_tree); + ret = test_case_6(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_7(); + ret = test_case_7(fs_info, BTRFS_I(inode)); + if (ret) + goto out; + ret = test_case_8(fs_info, BTRFS_I(inode)); if (ret) goto out; @@ -1045,7 +1187,8 @@ int btrfs_test_extent_map(void) } out: - kfree(em_tree); + iput(inode); + btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); return ret; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 492d69d2fa73..3ea3bc2225fe 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -117,7 +117,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) /* Now for a regular extent */ insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, - disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); + disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; disk_bytenr += sectorsize; offset += sectorsize - 1; @@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } -static unsigned long prealloc_only = 0; -static unsigned long compressed_only = 0; -static unsigned long vacancy_only = 0; +static u32 prealloc_only = 0; +static u32 compressed_only = 0; +static u32 vacancy_only = 0; static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { @@ -258,14 +258,14 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } free_extent_map(em); @@ -278,13 +278,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_INLINE) { - test_err("expected an inline, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_INLINE) { + test_err("expected an inline, got %llu", em->disk_bytenr); goto out; } @@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 
0 have %u", em->flags); goto out; } /* @@ -316,13 +316,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != 4) { @@ -332,20 +332,20 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } offset = em->start + em->len; free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize - 1) { @@ -355,25 +355,24 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -383,26 +382,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = em->block_start; + disk_bytenr = extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); + if 
(em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -412,19 +410,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -434,31 +432,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != orig_start) { - test_err("wrong orig offset, want %llu, have %llu", - orig_start, em->orig_start); + if (em->start - em->offset != orig_start) { + test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu", + em->start, em->offset, orig_start); goto out; } disk_bytenr += (em->start - orig_start); - if (em->block_start != disk_bytenr) { + if (extent_map_block_start(em) != disk_bytenr) { test_err("wrong block start, want %llu, have %llu", - disk_bytenr, em->block_start); + disk_bytenr, extent_map_block_start(em)); goto out; } offset = em->start + em->len; free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -468,26 +466,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -497,27 +494,26 @@ static 
noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = em->block_start; + disk_bytenr = extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_HOLE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_HOLE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -527,30 +523,29 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != orig_start) { - test_err("unexpected orig offset, wanted %llu, have %llu", - orig_start, em->orig_start); + if (em->start - em->offset != orig_start) { + test_err("unexpected offset, wanted %llu, have %llu", + em->start - orig_start, em->offset); goto out; } - if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + if (extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + (em->start - em->orig_start), - em->block_start); + disk_bytenr + em->offset, extent_map_block_start(em)); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -560,32 +555,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } - if (em->orig_start != orig_start) { - test_err("wrong orig offset, want %llu, have %llu", orig_start, - em->orig_start); + if (em->start - em->offset != orig_start) { + test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu", + em->start, em->offset, orig_start); goto out; } - if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + if (extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + (em->start - em->orig_start), - em->block_start); + disk_bytenr + em->offset, extent_map_block_start(em)); goto out; } offset = em->start + em->len; free_extent_map(em); /* Now for the compressed 
extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -595,31 +589,30 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", - em->start, em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } offset = em->start + em->len; free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -629,32 +622,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", - em->start, em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } - disk_bytenr = em->block_start; + disk_bytenr = extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -664,25 +656,24 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, 
want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != disk_bytenr) { + if (extent_map_block_start(em) != disk_bytenr) { test_err("block start does not match, want %llu got %llu", - disk_bytenr, em->block_start); + disk_bytenr, extent_map_block_start(em)); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -692,31 +683,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } - if (em->orig_start != orig_start) { - test_err("wrong orig offset, want %llu, have %llu", - em->start, orig_start); + if (em->start - em->offset != orig_start) { + test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu", + em->start, em->offset, orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } offset = em->start + em->len; free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -726,24 +717,23 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole extent, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole extent, got %llu", em->disk_bytenr); goto out; } /* @@ -758,25 +748,24 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != vacancy_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", vacancy_only, em->flags); goto out; } - if 
(em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong offset, want 0, have %llu", em->offset); goto out; } offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_err("expected a real extent, got %llu", em->block_start); + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { + test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } if (em->start != offset || em->len != sectorsize) { @@ -786,12 +775,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - if (em->orig_start != em->start) { - test_err("wrong orig offset, want %llu, have %llu", em->start, - em->orig_start); + if (em->offset != 0) { + test_err("wrong orig offset, want 0, have %llu", em->offset); goto out; } ret = 0; @@ -850,13 +838,13 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); + if (em->disk_bytenr != EXTENT_MAP_HOLE) { + test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } if (em->start != 0 || em->len != sectorsize) { @@ -866,19 +854,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != vacancy_only) { - test_err("wrong flags, wanted %lu, have %lu", vacancy_only, + test_err("wrong flags, wanted %u, have %u", vacancy_only, em->flags); goto out; } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != sectorsize) { - test_err("expected a real extent, got %llu", em->block_start); + if (extent_map_block_start(em) != sectorsize) { + test_err("expected a real extent, got %llu", extent_map_block_start(em)); goto out; } if (em->start != sectorsize || em->len != sectorsize) { @@ -888,7 +876,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, wanted 0 got %lu", + test_err("unexpected flags set, wanted 0 got %u", em->flags); goto out; } @@ -1095,8 +1083,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) test_msg("running inode tests"); - set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); - set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB; + prealloc_only |= EXTENT_FLAG_PREALLOC; ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0548072c642f..dbef80cd5a9f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,12 
+23,10 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" -#include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" @@ -145,8 +143,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.dirty_extent_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -164,7 +161,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the deleted_bgs list during a transaction abort. + */ + spin_lock(&transaction->fs_info->unused_bgs_lock); list_del_init(&cache->bg_list); + spin_unlock(&transaction->fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } @@ -278,8 +281,10 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); @@ -353,7 +358,7 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; - cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; + xa_init(&cur_trans->delayed_refs.dirty_extents); atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* @@ -384,7 +389,7 @@ loop: IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, IO_TREE_FS_PINNED_EXTENTS); - fs_info->generation++; + btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; @@ -407,7 +412,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int ret = 0; if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - root->last_trans < trans->transid) || force) { + btrfs_get_root_last_trans(root) < trans->transid) || force) { WARN_ON(!force && root->commit_root != root->node); /* @@ -423,15 +428,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, smp_wmb(); spin_lock(&fs_info->fs_roots_radix_lock); - if (root->last_trans == trans->transid && !force) { + if (btrfs_get_root_last_trans(root) == trans->transid && !force) { spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } radix_tree_tag_set(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); - root->last_trans = trans->transid; + btrfs_set_root_last_trans(root, trans->transid); /* this is pretty tricky. 
We don't want to * take the relocation lock in btrfs_record_root_in_trans @@ -474,7 +479,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, /* Make sure we don't try to update the root at commit time */ spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); } @@ -493,7 +498,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, * and barriers */ smp_rmb(); - if (root->last_trans == trans->transid && + if (btrfs_get_root_last_trans(root) == trans->transid && !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; @@ -552,13 +557,42 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) if (!fs_info->reloc_ctl || !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; return true; } +static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush, + u64 num_bytes, + u64 *delayed_refs_bytes) +{ + struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; + u64 bytes = num_bytes + *delayed_refs_bytes; + int ret; + + /* + * We want to reserve all the bytes we may need all at once, so we only + * do 1 enospc flushing cycle per transaction start. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + + /* + * If we are an emergency flush, which can steal from the global block + * reserve, then attempt to not reserve space for the delayed refs, as + * we will consume space for them from the global block reserve. + */ + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + bytes -= *delayed_refs_bytes; + *delayed_refs_bytes = 0; + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + } + + return ret; +} + static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, unsigned int num_items, unsigned int type, enum btrfs_reserve_flush_enum flush, @@ -566,10 +600,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; u64 qgroup_reserved = 0; + u64 delayed_refs_bytes = 0; bool reloc_reserved = false; bool do_chunk_alloc = false; int ret; @@ -592,9 +628,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { - struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; - u64 delayed_refs_bytes = 0; - qgroup_reserved = num_items * fs_info->nodesize; /* * Use prealloc for now, as there might be a currently running @@ -606,20 +639,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (ret) return ERR_PTR(ret); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); /* - * We want to reserve all the bytes we may need all at once, so - * we only do 1 enospc flushing cycle per transaction start. We - * accomplish this by simply assuming we'll do num_items worth - * of delayed refs updates in this trans handle, and refill that - * amount for whatever is missing in the reserve. 
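The removed comment above describes the old scheme: assume num_items worth of delayed ref updates and fold them into one reservation against the transaction rsv. The new btrfs_reserve_trans_metadata() shown earlier in this hunk keeps the single flushing cycle but reserves the delayed-ref bytes separately and, under BTRFS_RESERVE_FLUSH_ALL_STEAL, drops them on failure so the global block reserve absorbs the refs instead. A standalone model of that fallback, with made-up byte counts and a trivial stand-in for the space accounting:

    #include <stdio.h>

    static unsigned long long available = 96 * 1024;   /* pretend free space */

    static int reserve_bytes(unsigned long long bytes)
    {
        if (bytes > available)
            return -1;      /* models -ENOSPC */
        available -= bytes;
        return 0;
    }

    /* Mirrors the shape of btrfs_reserve_trans_metadata(). */
    static int reserve_trans_metadata(unsigned long long num_bytes,
                                      unsigned long long *delayed_refs_bytes,
                                      int can_steal)
    {
        int ret = reserve_bytes(num_bytes + *delayed_refs_bytes);

        if (ret && can_steal) {
            /* Let the global reserve cover the delayed refs. */
            *delayed_refs_bytes = 0;
            ret = reserve_bytes(num_bytes);
        }
        return ret;
    }

    int main(void)
    {
        unsigned long long delayed = 64 * 1024;
        int ret = reserve_trans_metadata(64 * 1024, &delayed, 1);

        /* 128K fails against 96K free; the 64K retry succeeds. */
        printf("ret=%d delayed=%llu\n", ret, delayed);
        return 0;
    }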
+ * If we plan to insert/update/delete "num_items" from a btree, + * we will also generate delayed refs for extent buffers in the + * respective btree paths, so reserve space for the delayed refs + * that will be generated by the caller as it modifies btrees. + * Try to reserve them to avoid excessive use of the global + * block reserve. */ - num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); - if (flush == BTRFS_RESERVE_FLUSH_ALL && - !btrfs_block_rsv_full(delayed_refs_rsv)) { - delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, - num_items); - num_bytes += delayed_refs_bytes; - } + delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items); /* * Do the reservation for the relocation root creation @@ -629,16 +658,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush); + ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes, + &delayed_refs_bytes); if (ret) goto reserve_fail; - if (delayed_refs_bytes) { - btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes); - num_bytes -= delayed_refs_bytes; - } - btrfs_block_rsv_add_bytes(rsv, num_bytes, true); - if (rsv->space_info->force_alloc) + btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true); + + if (trans_rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && !btrfs_block_rsv_full(delayed_refs_rsv)) { @@ -698,6 +725,7 @@ again: h->type = type; INIT_LIST_HEAD(&h->new_bgs); + btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS); smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START && @@ -710,8 +738,17 @@ again: if (num_bytes) { trace_btrfs_space_reservation(fs_info, "transaction", h->transid, num_bytes, 1); - h->block_rsv = &fs_info->trans_block_rsv; + h->block_rsv = trans_rsv; h->bytes_reserved = num_bytes; + if (delayed_refs_bytes > 0) { + trace_btrfs_space_reservation(fs_info, + "local_delayed_refs_rsv", + h->transid, + delayed_refs_bytes, 1); + h->delayed_refs_bytes_reserved = delayed_refs_bytes; + btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true); + delayed_refs_bytes = 0; + } h->reloc_reserved = reloc_reserved; } @@ -766,8 +803,10 @@ join_fail: kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: if (num_bytes) - btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, - num_bytes, NULL); + btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); + if (delayed_refs_bytes) + btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, + delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); @@ -814,7 +853,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo } /* - * btrfs_attach_transaction() - catch the running transaction + * Catch the running transaction. * * It is used when we want to commit the current transaction, but * don't want to start a new one. @@ -833,7 +872,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) } /* - * btrfs_attach_transaction_barrier() - catch the running transaction + * Catch the running transaction. 
* * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully @@ -909,7 +948,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) int ret = 0; if (transid) { - if (transid <= fs_info->last_trans_committed) + if (transid <= btrfs_get_last_trans_committed(fs_info)) goto out; /* find specified transaction */ @@ -933,7 +972,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) * raced with btrfs_commit_transaction */ if (!cur_trans) { - if (transid > fs_info->last_trans_committed) + if (transid > btrfs_get_last_trans_committed(fs_info)) ret = -EINVAL; goto out; } @@ -988,11 +1027,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) if (!trans->block_rsv) { ASSERT(!trans->bytes_reserved); + ASSERT(!trans->delayed_refs_bytes_reserved); return; } - if (!trans->bytes_reserved) + if (!trans->bytes_reserved) { + ASSERT(!trans->delayed_refs_bytes_reserved); return; + } ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", @@ -1000,6 +1042,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) btrfs_block_rsv_release(fs_info, trans->block_rsv, trans->bytes_reserved, NULL); trans->bytes_reserved = 0; + + if (!trans->delayed_refs_bytes_reserved) + return; + + trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv", + trans->transid, + trans->delayed_refs_bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, &trans->delayed_rsv, + trans->delayed_refs_bytes_reserved, NULL); + trans->delayed_refs_bytes_reserved = 0; } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -1007,7 +1059,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - int err = 0; + int ret = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); @@ -1046,13 +1098,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) - err = trans->aborted; + ret = trans->aborted; else - err = -EROFS; + ret = -EROFS; } kmem_cache_free(btrfs_trans_handle_cachep, trans); - return err; + return ret; } int btrfs_end_transaction(struct btrfs_trans_handle *trans) @@ -1073,8 +1125,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans) int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { - int err = 0; - int werr = 0; + int ret = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; @@ -1084,7 +1135,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, mark, &cached_state)) { bool wait_writeback = false; - err = convert_extent_bit(dirty_pages, start, end, + ret = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, &cached_state); /* @@ -1100,22 +1151,22 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, * We cleanup any entries left in the io tree when committing * the transaction (through extent_io_tree_release()). 
*/ - if (err == -ENOMEM) { - err = 0; + if (ret == -ENOMEM) { + ret = 0; wait_writeback = true; } - if (!err) - err = filemap_fdatawrite_range(mapping, start, end); - if (err) - werr = err; - else if (wait_writeback) - werr = filemap_fdatawait_range(mapping, start, end); + if (!ret) + ret = filemap_fdatawrite_range(mapping, start, end); + if (!ret && wait_writeback) + ret = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); + if (ret) + break; cached_state = NULL; cond_resched(); start = end + 1; } - return werr; + return ret; } /* @@ -1127,12 +1178,11 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { - int err = 0; - int werr = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + int ret = 0; while (find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { @@ -1144,22 +1194,20 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ - err = clear_extent_bit(dirty_pages, start, end, + ret = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, &cached_state); - if (err == -ENOMEM) - err = 0; - if (!err) - err = filemap_fdatawait_range(mapping, start, end); - if (err) - werr = err; + if (ret == -ENOMEM) + ret = 0; + if (!ret) + ret = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); + if (ret) + break; cached_state = NULL; cond_resched(); start = end + 1; } - if (err) - werr = err; - return werr; + return ret; } static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, @@ -1184,7 +1232,7 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) bool errors = false; int err; - ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); err = __btrfs_wait_marked_extents(fs_info, dirty_pages); if ((mark & EXTENT_DIRTY) && @@ -1331,7 +1379,7 @@ again: } /* Now flush any delayed refs generated by updating all of the roots */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; @@ -1346,7 +1394,7 @@ again: * so we want to keep this flushing in this loop to make sure * everything gets run. */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; } @@ -1447,7 +1495,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) ASSERT(atomic_read(&root->log_commit[1]) == 0); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1481,45 +1529,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) } /* - * defrag a given btree. - * Every leaf in the btree is read and defragged. 
- */ -int btrfs_defrag_root(struct btrfs_root *root) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_trans_handle *trans; - int ret; - - if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) - return 0; - - while (1) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - - ret = btrfs_defrag_leaves(trans, root); - - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(info); - cond_resched(); - - if (btrfs_fs_closing(info) || ret != -EAGAIN) - break; - - if (btrfs_defrag_cancelled(info)) { - btrfs_debug(info, "defrag_root cancelled"); - ret = -EAGAIN; - break; - } - } - clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); - return ret; -} - -/* * Do all special snapshot related qgroup dirty hack. * * Will do all needed qgroup inherit and dirty hack like switch commit @@ -1536,11 +1545,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, int ret; /* - * Save some performance in the case that qgroups are not - * enabled. If this check races with the ioctl, rescan will - * kick in anyway. + * Save some performance in the case that qgroups are not enabled. If + * this check races with the ioctl, rescan will kick in anyway. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* @@ -1564,7 +1572,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * for now flush the delayed refs to narrow the race window where the * qgroup counters could end up wrong. */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -1578,8 +1586,8 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, goto out; /* Now qgroup are all updated, we can inherit it to new qgroups */ - ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, - inherit); + ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid, + btrfs_root_id(parent), inherit); if (ret < 0) goto out; @@ -1636,7 +1644,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode = pending->dir; + struct inode *parent_inode = &pending->dir->vfs_inode; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; @@ -1729,6 +1737,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_release_path(path); + ret = btrfs_create_qgroup(trans, objectid); + if (ret && ret != -EEXIST) { + if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + } + /* * pull in the delayed directory update * and the delayed inode item @@ -1811,7 +1827,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert root back/forward references */ ret = btrfs_add_root_ref(trans, objectid, - parent_root->root_key.objectid, + btrfs_root_id(parent_root), btrfs_ino(BTRFS_I(parent_inode)), index, &fname.disk_name); if (ret) { @@ -1840,16 +1856,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * To co-operate with that hack, we do hack again. 
* Or snapshot will be greatly slowed down by a subtree qgroup rescan */ - ret = qgroup_account_snapshot(trans, root, parent_root, - pending->inherit, objectid); + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid, + btrfs_root_id(parent_root), pending->inherit); if (ret < 0) goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, index); - /* We have check then name at the beginning, so it is impossible. */ - BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1857,8 +1875,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + fname.disk_name.len * 2); - parent_inode->i_mtime = inode_set_ctime_current(parent_inode); - ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(parent_inode, + inode_set_ctime_current(parent_inode)); + ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1940,19 +1959,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->uuid_tree_generation = root_item->generation; } -int btrfs_transaction_in_commit(struct btrfs_fs_info *info) -{ - struct btrfs_transaction *trans; - int ret = 0; - - spin_lock(&info->trans_lock); - trans = info->running_transaction; - if (trans) - ret = (trans->state >= TRANS_STATE_COMMIT_START); - spin_unlock(&info->trans_lock); - return ret; -} - int btrfs_transaction_blocked(struct btrfs_fs_info *info) { struct btrfs_transaction *trans; @@ -1992,6 +1998,25 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) btrfs_put_transaction(cur_trans); } +/* + * If there is a running transaction commit it or if it's already committing, + * wait for its commit to complete. Does not start and commit a new transaction + * if there isn't any running. + */ +int btrfs_commit_current_transaction(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + int ret = PTR_ERR(trans); + + return (ret == -ENOENT) ? 0 : ret; + } + + return btrfs_commit_transaction(trans); +} + static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2081,8 +2106,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) struct btrfs_block_group *block_group, *tmp; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the new_bgs list during a transaction abort. 
+ */ + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); } } @@ -2113,7 +2144,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } /* @@ -2400,7 +2431,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) goto unlock_reloc; @@ -2533,7 +2564,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); - fs_info->last_trans_committed = cur_trans->transid; + btrfs_set_last_trans_committed(fs_info, cur_trans->transid); /* * We needn't acquire the lock here because there is no other task * which can change it. @@ -2622,7 +2653,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); + btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root)); btrfs_kill_all_delayed_nodes(root); @@ -2651,25 +2682,23 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) */ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno, bool first_hit) + unsigned int line, int error, bool first_hit) { struct btrfs_fs_info *fs_info = trans->fs_info; - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) + WRITE_ONCE(trans->aborted, error); + WRITE_ONCE(trans->transaction->aborted, error); + if (first_hit && error == -ENOSPC) btrfs_dump_space_info_for_trans_abort(fs_info); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); + __btrfs_handle_fs_error(fs_info, function, line, error, NULL); } int __init btrfs_transaction_init(void) { - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY); if (!btrfs_trans_handle_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7623db359881..dd9ce9b9f69e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -6,12 +6,27 @@ #ifndef BTRFS_TRANSACTION_H #define BTRFS_TRANSACTION_H +#include <linux/atomic.h> #include <linux/refcount.h> +#include <linux/list.h> +#include <linux/time64.h> +#include <linux/mutex.h> +#include <linux/wait.h> #include "btrfs_inode.h" #include "delayed-ref.h" -#include "ctree.h" +#include "extent-io-tree.h" +#include "block-rsv.h" +#include "messages.h" #include "misc.h" +struct dentry; +struct inode; +struct btrfs_pending_snapshot; +struct btrfs_fs_info; +struct btrfs_root_item; +struct btrfs_root; +struct btrfs_path; + /* * Signal that a direct IO write is in progress, to avoid deadlock for sync * direct IO writes when fsync is called during the direct IO write path. 
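The transaction.h hunk above swaps the heavyweight ctree.h include for targeted includes plus forward declarations. The pattern, in miniature (the example struct is illustrative):

    struct btrfs_root;			/* forward declaration is enough... */

    struct example_handle {
    	struct btrfs_root *root;	/* ...because only a pointer is stored */
    };

Headers that merely store pointers to a type never need its full definition, which keeps the include graph shallow and rebuilds cheap.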
@@ -127,8 +142,10 @@ enum { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 delayed_refs_bytes_reserved; u64 chunk_bytes_reserved; unsigned long delayed_ref_updates; + unsigned long delayed_ref_csum_deletions; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; @@ -148,6 +165,7 @@ struct btrfs_trans_handle { bool in_fsync; struct btrfs_fs_info *fs_info; struct list_head new_bgs; + struct btrfs_block_rsv delayed_rsv; }; /* @@ -160,7 +178,7 @@ struct btrfs_trans_handle { struct btrfs_pending_snapshot { struct dentry *dentry; - struct inode *dir; + struct btrfs_inode *dir; struct btrfs_root *root; struct btrfs_root_item *root_item; struct btrfs_root *snap; @@ -181,7 +199,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, { spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; - inode->last_sub_trans = inode->root->log_transid; + inode->last_sub_trans = btrfs_get_root_log_transid(inode->root); inode->last_log_commit = inode->last_sub_trans - 1; spin_unlock(&inode->lock); } @@ -209,32 +227,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int errno); +bool __cold abort_should_print_stack(int error); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact stack trace is reported for some errors. */ -#define btrfs_abort_transaction(trans, errno) \ +#define btrfs_abort_transaction(trans, error) \ do { \ - bool first = false; \ + bool __first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ + __first = true; \ + if (WARN(abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ + (error))) { \ /* Stack trace printed. 
*/ \ } else { \ btrfs_err((trans)->fs_info, \ "Transaction aborted (error %d)", \ - (errno)); \ + (error)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ + __LINE__, (error), __first); \ } while (0) int btrfs_end_transaction(struct btrfs_trans_handle *trans); @@ -252,11 +270,11 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root); void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); +int btrfs_commit_current_transaction(struct btrfs_root *root); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); @@ -266,14 +284,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); -int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno, bool first_hit); + unsigned int line, int error, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 53c74010140e..ffa5b83d3a4a 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -21,7 +21,6 @@ #include "messages.h" #include "ctree.h" #include "tree-checker.h" -#include "disk-io.h" #include "compression.h" #include "volumes.h" #include "misc.h" @@ -29,6 +28,7 @@ #include "accessors.h" #include "file-item.h" #include "inode-item.h" +#include "dir-item.h" #include "extent-tree.h" /* @@ -65,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -92,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -152,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -337,6 +340,24 @@ static int check_extent_data_item(struct extent_buffer *leaf, } } + /* + * For non-compressed data extents, ram_bytes should match its + * disk_num_bytes. 
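The btrfs_abort_transaction() hunk above renames the macro argument errno to error, steering clear of the errno macro from errno.h, and the local first to __first so it cannot shadow a caller's variable. A toy illustration of the shadowing hazard (LOG_ONCE is invented):

    #define LOG_ONCE(msg)					\
    	do {						\
    		static bool first = true;		\
    		if (first)				\
    			pr_info("%s\n", (msg));		\
    		first = false;				\
    	} while (0)

If the surrounding function also declares a bool first, the macro silently operates on its own copy; reserved-style names such as __first make that collision far less likely.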
+ * However, we do not really utilize ram_bytes in this case, so this check + * is optional and done only for DEBUG builds, so that developers can catch + * unexpected behaviors. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG) && + btrfs_file_extent_compression(leaf, fi) == BTRFS_COMPRESS_NONE && + btrfs_file_extent_disk_bytenr(leaf, fi)) { + if (WARN_ON(btrfs_file_extent_ram_bytes(leaf, fi) != + btrfs_file_extent_disk_num_bytes(leaf, fi))) + file_extent_err(leaf, slot, +"mismatch ram_bytes (%llu) and disk_num_bytes (%llu) for non-compressed extent", + btrfs_file_extent_ram_bytes(leaf, fi), + btrfs_file_extent_disk_num_bytes(leaf, fi)); + } + return 0; } @@ -614,7 +635,7 @@ static int check_dir_item(struct extent_buffer *leaf, */ if (key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_XATTR_ITEM_KEY) { - char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + char namebuf[MAX(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; read_extent_buffer(leaf, namebuf, (unsigned long)(di + 1), name_len); @@ -648,6 +669,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1004,6 +1026,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1259,6 +1282,7 @@ static void extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", btrfs_header_level(eb) == 0 ?
"leaf" : "node", @@ -1527,6 +1551,9 @@ static int check_extent_item(struct extent_buffer *leaf, } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: extent_err(leaf, slot, "unknown inline ref type: %u", inline_type); @@ -1743,6 +1770,25 @@ static int check_inode_ref(struct extent_buffer *leaf, return 0; } +static int check_raid_stripe_extent(const struct extent_buffer *leaf, + const struct btrfs_key *key, int slot) +{ + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { + generic_err(leaf, slot, +"invalid key objectid for raid stripe extent, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + + if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) { + generic_err(leaf, slot, + "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset"); + return -EUCLEAN; + } + + return 0; +} + static int check_dev_extent_item(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot, @@ -1866,6 +1912,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: ret = check_extent_data_ref(leaf, key, slot); break; + case BTRFS_RAID_STRIPE_KEY: + ret = check_raid_stripe_extent(leaf, key, slot); + break; } if (ret) @@ -1889,6 +1938,11 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) return BTRFS_TREE_BLOCK_INVALID_LEVEL; } + if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) { + generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set"); + return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; + } + /* * Extent buffers from a relocation tree have a owner field that * corresponds to the subvolume tree they are based on. So just from an @@ -1950,6 +2004,7 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; u64 item_data_end; + enum btrfs_tree_block_status ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -2005,21 +2060,10 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } - /* - * We only want to do this if WRITTEN is set, otherwise the leaf - * may be in some intermediate state and won't appear valid. - */ - if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { - enum btrfs_tree_block_status ret; - - /* - * Check if the item size and content meet other - * criteria - */ - ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) - return ret; - } + /* Check if the item size and content meet other criteria. 
*/ + ret = check_leaf_item(leaf, &key, slot, &prev_key); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return ret; prev_key.objectid = key.objectid; prev_key.type = key.type; @@ -2049,6 +2093,11 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) int level = btrfs_header_level(node); u64 bytenr; + if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) { + generic_err(node, 0, "invalid flag for node, WRITTEN not set"); + return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; + } + if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", @@ -2113,7 +2162,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) * Skip dummy fs, as selftests don't create unique ebs for each dummy * root. */ - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) + if (btrfs_is_testing(eb->fs_info)) return 0; /* * There are several call sites (backref walking, qgroup, and data @@ -2186,7 +2235,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * So we only checks tree blocks which is read from disk, whose * generation <= fs_info->last_trans_committed. */ - if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info)) return 0; /* We have @first_key, so this @eb must have at least one item */ diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 3c2a02a72f64..01669cfa6578 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,10 +6,12 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H +#include <linux/types.h> #include <uapi/linux/btrfs_tree.h> struct extent_buffer; struct btrfs_chunk; +struct btrfs_key; /* All the extra info needed to verify the parentness of a tree block. */ struct btrfs_tree_parent_check { @@ -22,7 +24,7 @@ struct btrfs_tree_parent_check { /* * Expected transid, can be 0 to skip the check, but such skip - * should only be utlized for backref walk related code. + * should only be utilized for backref walk related code. */ u64 transid; @@ -51,6 +53,7 @@ enum btrfs_tree_block_status { BTRFS_TREE_BLOCK_INVALID_BLOCKPTR, BTRFS_TREE_BLOCK_INVALID_ITEM, BTRFS_TREE_BLOCK_INVALID_OWNER, + BTRFS_TREE_BLOCK_WRITTEN_NOT_SET, }; /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cc9a2f8a4ae3..31adea5b0b96 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -13,13 +13,11 @@ #include "tree-log.h" #include "disk-io.h" #include "locking.h" -#include "print-tree.h" #include "backref.h" #include "compression.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" @@ -140,11 +138,14 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ -static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { unsigned int nofs_flag; struct inode *inode; + /* Only meant to be called for subvolume roots and not for log roots. */ + ASSERT(is_fstree(btrfs_root_id(root))); + /* * We're holding a transaction handle whether we are logging or * replaying a log tree, so we must make sure NOFS semantics apply @@ -153,10 +154,13 @@ static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) * attempt a transaction commit, resulting in a deadlock. 
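The comment above, from btrfs_iget_logging(), explains why log replay and logging must run under NOFS semantics. The mechanism is the scoped-allocation API: everything allocated inside the window implicitly drops __GFP_FS, so direct reclaim cannot re-enter the filesystem while a transaction handle is held. The general shape:

    unsigned int nofs_flag;

    nofs_flag = memalloc_nofs_save();
    /* allocations here behave as GFP_NOFS even if they pass GFP_KERNEL */
    memalloc_nofs_restore(nofs_flag);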
*/ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget(root->fs_info->sb, objectid, root); + inode = btrfs_iget(objectid, root); memalloc_nofs_restore(nofs_flag); - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return BTRFS_I(inode); } /* @@ -320,8 +324,7 @@ struct walk_control { /* * Ignore any items from the inode currently being processed. Needs - * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in - * the LOG_WALK_REPLAY_INODES stage. + * to be set every time we find a BTRFS_INODE_ITEM_KEY. */ bool ignore_cur_inode; @@ -366,8 +369,7 @@ static int process_one_buffer(struct btrfs_root *log, } if (wc->pin) { - ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, - eb->len); + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); if (ret) return ret; @@ -413,7 +415,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * the leaf before writing into the log tree. See the comments at * copy_items() for more details. */ - ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); @@ -613,21 +615,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, return 0; } -/* - * simple helper to read an inode off the disk from a given root - * This can only be called for subvolume roots and not for the log - */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) -{ - struct inode *inode; - - inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - inode = NULL; - return inode; -} - /* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. @@ -653,7 +640,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 start = key->offset; u64 nbytes = 0; struct btrfs_file_extent_item *item; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; unsigned long size; int ret = 0; @@ -677,23 +664,19 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - ret = 0; - goto out; + return 0; } - inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } + inode = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); /* * first check to see if we already have this extent in the * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. 
*/ - ret = btrfs_lookup_file_extent(trans, root, path, - btrfs_ino(BTRFS_I(inode)), start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || @@ -727,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; @@ -770,7 +753,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; if (ins.objectid > 0) { - struct btrfs_ref ref = { 0 }; u64 csum_start; u64 csum_end; LIST_HEAD(ordered_sums); @@ -784,12 +766,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret < 0) { goto out; } else if (ret == 0) { - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, - ins.objectid, ins.offset, 0); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - key->objectid, offset, 0, false); + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, key->objectid, offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; @@ -799,7 +784,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * allocation tree */ ret = btrfs_alloc_logged_file_extent(trans, - root->root_key.objectid, + btrfs_root_id(root), key->objectid, offset, &ins); if (ret) goto out; @@ -818,9 +803,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, - &ordered_sums, 0, false); - if (ret) + &ordered_sums, false); + if (ret < 0) goto out; + ret = 0; /* * Now delete all existing csums in the csum root that * cover our range.
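The struct btrfs_ref conversion above swaps positional init helpers for designated initializers: unnamed members are zero-initialized and every populated field is named at the call site. Generic illustration (example_ref and its fields are invented):

    struct example_ref {
    	int action;
    	u64 bytenr;
    	u64 num_bytes;
    	u64 ref_root;
    };

    struct example_ref ref = {
    	.action    = 1,
    	.bytenr    = 12288,
    	.num_bytes = 4096,
    	/* .ref_root omitted: implicitly zero */
    };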
We do this because we can have an @@ -902,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, - extent_end - start); + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out; update_inode: - btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, inode); out: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -948,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; - struct inode *inode; + struct btrfs_inode *inode; struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; @@ -963,9 +948,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -973,10 +959,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1087,7 +1074,9 @@ again: search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret == 0) { + if (ret < 0) { + return ret; + } else if (ret == 0) { struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; @@ -1149,7 +1138,7 @@ again: u32 item_size; u32 cur_offset = 0; unsigned long base; - struct inode *victim_parent; + struct btrfs_inode *victim_parent; leaf = path->nodes[0]; @@ -1160,13 +1149,13 @@ again: struct fscrypt_str victim_name; extref = (struct btrfs_inode_extref *)(base + cur_offset); + victim_name.len = btrfs_inode_extref_name_len(leaf, extref); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; ret = read_alloc_one_name(leaf, &extref->name, - btrfs_inode_extref_name_len(leaf, extref), - &victim_name); + victim_name.len, &victim_name); if (ret) return ret; @@ -1181,18 +1170,18 @@ again: kfree(victim_name.name); return ret; } else if (!ret) { - ret = -ENOENT; - victim_parent = read_one_inode(root, - parent_objectid); - if (victim_parent) { + victim_parent = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(victim_parent)) { + ret = PTR_ERR(victim_parent); + } else { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, - BTRFS_I(victim_parent), + victim_parent, inode, &victim_name); + iput(&victim_parent->vfs_inode); } - iput(victim_parent); kfree(victim_name.name); if (ret) return ret; @@ -1326,19 +1315,18 @@ again: ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { - struct inode *dir; + struct btrfs_inode *dir; btrfs_release_path(path); - dir = read_one_inode(root, parent_id); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_id, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); kfree(name.name); goto out; } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, &name); + ret 
= unlink_inode_for_log_replay(trans, dir, inode, &name); kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (ret) goto out; goto again; @@ -1370,8 +1358,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir = NULL; - struct inode *inode = NULL; + struct btrfs_inode *dir = NULL; + struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; @@ -1404,15 +1392,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, parent_objectid); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + dir = NULL; goto out; } - inode = read_one_inode(root, inode_objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(inode_objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -1420,24 +1412,44 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (log_ref_ver) { ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); + if (ret) + goto out; /* * parent object can change from one array * item to another. */ - if (!dir) - dir = read_one_inode(root, parent_objectid); if (!dir) { - ret = -ENOENT; - goto out; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; + /* + * A new parent dir may have not been + * logged and not exist in the subvolume + * tree, see the comment above before + * the loop when getting the first + * parent dir. + */ + if (ret == -ENOENT) { + /* + * The next extref may refer to + * another parent dir that + * exists, so continue. + */ + ret = 0; + goto next; + } + goto out; + } } } else { ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); + if (ret) + goto out; } - if (ret) - goto out; - ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, &name); + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1448,8 +1460,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), BTRFS_I(inode), + ret = __add_inode_ref(trans, root, path, log, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { @@ -1459,22 +1470,22 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 0, ref_index); + ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } /* Else, ret == 1, we already have a perfect match, we're done. 
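With read_one_inode() gone, the callers converted above all follow the same error-path discipline: keep the pointer NULL until the lookup succeeds so one shared exit label can clean up unconditionally. Condensed sketch (example_replay_step is hypothetical):

    static int example_replay_step(u64 objectid, struct btrfs_root *root)
    {
    	struct btrfs_inode *inode = NULL;
    	int ret;

    	inode = btrfs_iget_logging(objectid, root);
    	if (IS_ERR(inode)) {
    		ret = PTR_ERR(inode);
    		inode = NULL;	/* keeps the cleanup below valid */
    		goto out;
    	}
    	/* ... act on the inode ... */
    	ret = 0;
    out:
    	if (inode)
    		iput(&inode->vfs_inode);
    	return ret;
    }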
*/ +next: ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; kfree(name.name); name.name = NULL; - if (log_ref_ver) { - iput(dir); + if (log_ref_ver && dir) { + iput(&dir->vfs_inode); dir = NULL; } } @@ -1487,8 +1498,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, - key); + ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out; @@ -1497,13 +1507,14 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); kfree(name.name); - iput(dir); - iput(inode); + if (dir) + iput(&dir->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } -static int count_inode_extrefs(struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_path *path) +static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path) { int ret = 0; int name_len; @@ -1517,8 +1528,8 @@ static int count_inode_extrefs(struct btrfs_root *root, struct extent_buffer *leaf; while (1) { - ret = btrfs_find_one_extref(root, inode_objectid, offset, path, - &extref, &offset); + ret = btrfs_find_one_extref(inode->root, inode_objectid, offset, + path, &extref, &offset); if (ret) break; @@ -1546,8 +1557,7 @@ static int count_inode_extrefs(struct btrfs_root *root, return nlink; } -static int count_inode_refs(struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_path *path) +static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) { int ret; struct btrfs_key key; @@ -1562,7 +1572,7 @@ static int count_inode_refs(struct btrfs_root *root, key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); if (ret < 0) break; if (ret > 0) { @@ -1614,9 +1624,9 @@ process_slot: * will free the inode. 
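count_inode_refs() above seeds its search key with the largest possible offset, a recurring btree idiom: btrfs_search_slot() lands just past the last matching item and the loop then walks backwards. The seed, in isolation (ino stands for the inode number being counted):

    struct btrfs_key key;

    key.objectid = ino;
    key.type = BTRFS_INODE_REF_KEY;
    key.offset = (u64)-1;	/* sorts after every real offset */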
*/ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int ret; u64 nlink = 0; @@ -1626,13 +1636,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = count_inode_refs(root, BTRFS_I(inode), path); + ret = count_inode_refs(BTRFS_I(inode), path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(root, BTRFS_I(inode), path); + ret = count_inode_extrefs(BTRFS_I(inode), path); if (ret < 0) goto out; @@ -1642,11 +1652,12 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) goto out; } - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = (u64)-1; if (inode->i_nlink == 0) { if (S_ISDIR(inode->i_mode)) { @@ -1671,12 +1682,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct inode *inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) break; @@ -1698,14 +1710,14 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; btrfs_release_path(path); - inode = read_one_inode(root, key.offset); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.offset, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } - ret = fixup_inode_link_count(trans, root, inode); - iput(inode); + ret = fixup_inode_link_count(trans, &inode->vfs_inode); + iput(&inode->vfs_inode); if (ret) break; @@ -1733,12 +1745,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, { struct btrfs_key key; int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; + struct inode *vfs_inode; - inode = read_one_inode(root, objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; @@ -1747,15 +1761,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - if (!inode->i_nlink) - set_nlink(inode, 1); + if (!vfs_inode->i_nlink) + set_nlink(vfs_inode, 1); else - inc_nlink(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; } - iput(inode); + iput(vfs_inode); return ret; } @@ -1771,27 +1785,26 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_key *location) { - struct inode *inode; - struct inode *dir; + struct btrfs_inode *inode; + struct btrfs_inode *dir; int ret; - inode = read_one_inode(root, location->objectid); - if (!inode) - return -ENOENT; + inode = btrfs_iget_logging(location->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); - dir = read_one_inode(root, dirid); - if (!dir) { - iput(inode); - return -EIO; + dir = btrfs_iget_logging(dirid, root); + if (IS_ERR(dir)) { + iput(&inode->vfs_inode); + return 
PTR_ERR(dir); } - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 1, index); + ret = btrfs_add_link(trans, dir, inode, name, 1, index); /* FIXME, put inode into FIXUP list */ - iput(inode); - iput(dir); + iput(&inode->vfs_inode); + iput(&dir->vfs_inode); return ret; } @@ -1853,16 +1866,16 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; - struct inode *dir; + struct btrfs_inode *dir; u8 log_flags; bool exists; int ret; bool update_size = true; bool name_added = false; - dir = read_one_inode(root, key->objectid); - if (!dir) - return -EIO; + dir = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(dir)) + return PTR_ERR(dir); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) @@ -1883,9 +1896,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1900,9 +1912,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - index_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1957,11 +1968,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); + ret = btrfs_update_inode(trans, dir); } kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (!ret && name_added) ret = 1; return ret; @@ -2118,16 +2129,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_key *dir_key) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; struct btrfs_key location; /* @@ -2164,9 +2175,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -2174,9 +2186,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (ret) goto out; - inc_nlink(inode); - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - &name); + inc_nlink(&inode->vfs_inode); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ 
-2186,7 +2197,8 @@ out: btrfs_release_path(path); btrfs_release_path(log_path); kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -2310,7 +2322,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; - struct inode *dir; + struct btrfs_inode *dir; dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; @@ -2318,14 +2330,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (!log_path) return -ENOMEM; - dir = read_one_inode(root, dirid); - /* it isn't an error if the inode isn't there, that can happen - * because we replay the deletes before we copy in the inode item - * from the log + dir = btrfs_iget_logging(dirid, root); + /* + * It isn't an error if the inode isn't there, that can happen because + * we replay the deletes before we copy in the inode item from the log. */ - if (!dir) { + if (IS_ERR(dir)) { btrfs_free_path(log_path); - return 0; + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + return ret; } range_start = 0; @@ -2387,7 +2402,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_free_path(log_path); - iput(dir); + iput(&dir->vfs_inode); return ret; } @@ -2431,23 +2446,30 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { - btrfs_item_key_to_cpu(eb, &key, i); + struct btrfs_inode_item *inode_item; - /* inode keys are done during the first stage */ - if (key.type == BTRFS_INODE_ITEM_KEY && - wc->stage == LOG_WALK_REPLAY_INODES) { - struct btrfs_inode_item *inode_item; - u32 mode; + btrfs_item_key_to_cpu(eb, &key, i); - inode_item = btrfs_item_ptr(eb, i, - struct btrfs_inode_item); + if (key.type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); /* - * If we have a tmpfile (O_TMPFILE) that got fsync'ed - * and never got linked before the fsync, skip it, as - * replaying it is pointless since it would be deleted - * later. We skip logging tmpfiles, but it's always - * possible we are replaying a log created with a kernel - * that used to log tmpfiles. + * An inode with no links is either: + * + * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never + * got linked before the fsync, skip it, as replaying + * it is pointless since it would be deleted later. + * We skip logging tmpfiles, but it's always possible + * we are replaying a log created with a kernel that + * used to log tmpfiles; + * + * 2) A non-tmpfile which got its last link deleted + * while holding an open fd on it and later got + * fsynced through that fd. We always log the + * parent inodes when inode->last_unlink_trans is + * set to the current transaction, so ignore all the + * inode items for this inode. We will delete the + * inode when processing the parent directory with + * replay_dir_deletes(). */ if (btrfs_inode_nlink(eb, inode_item) == 0) { wc->ignore_cur_inode = true; @@ -2455,8 +2477,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, } else { wc->ignore_cur_inode = false; } - ret = replay_xattr_deletes(wc->trans, root, log, - path, key.objectid); + } + + /* Inode keys are done during the first stage. 
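The replay_one_buffer() rework above hoists the nlink test so it now runs for every BTRFS_INODE_ITEM_KEY rather than only in the inode-replay stage; wc->ignore_cur_inode then behaves like a small state machine across the following items. Reduced control flow (as it would sit inside the per-item loop):

    if (key.type == BTRFS_INODE_ITEM_KEY)
    	wc->ignore_cur_inode = (btrfs_inode_nlink(eb, inode_item) == 0);

    if (wc->ignore_cur_inode)
    	continue;	/* skip every remaining item of this inode */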
*/ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + u32 mode; + + ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); @@ -2481,30 +2509,28 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, */ if (S_ISREG(mode)) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct inode *inode; + struct btrfs_inode *inode; u64 from; - inode = read_one_inode(root, key.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } - from = ALIGN(i_size_read(inode), + from = ALIGN(i_size_read(&inode->vfs_inode), root->fs_info->sectorsize); drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, - BTRFS_I(inode), + ret = btrfs_drop_extents(wc->trans, root, inode, &drop_args); if (!ret) { - inode_sub_bytes(inode, + inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, - root, BTRFS_I(inode)); + ret = btrfs_update_inode(wc->trans, inode); } - iput(inode); + iput(&inode->vfs_inode); if (ret) break; } @@ -2539,9 +2565,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); - if (ret && ret != -ENOENT) + if (ret) break; - ret = 0; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); @@ -2562,14 +2587,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, /* * Correctly adjust the reserved bytes occupied by a log tree extent buffer */ -static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) +static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { btrfs_err(fs_info, "unable to find block group for %llu", start); - return; + return -ENOENT; } spin_lock(&cache->space_info->lock); @@ -2580,28 +2605,22 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) spin_unlock(&cache->space_info->lock); btrfs_put_block_group(cache); + + return 0; } static int clean_log_buffer(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { - int ret; - btrfs_tree_lock(eb); btrfs_clear_buffer_dirty(trans, eb); wait_on_extent_buffer_writeback(eb); btrfs_tree_unlock(eb); - if (trans) { - ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); - if (ret) - return ret; - btrfs_redirty_list_add(trans->transaction, eb); - } else { - unaccount_log_buffer(eb->fs_info, eb->start); - } + if (trans) + return btrfs_pin_reserved_extent(trans, eb); - return 0; + return unaccount_log_buffer(eb->fs_info, eb->start); } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, @@ -2842,6 +2861,52 @@ static void wait_for_writer(struct btrfs_root *root) finish_wait(&root->log_writer_wait, &wait); } +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + ctx->log_new_dentries = false; + ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; + ctx->logged_before = false; + ctx->inode = inode; + INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + 
ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; + ctx->scratch_eb = NULL; +} + +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = ctx->inode; + + if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return; + + /* + * Don't care about allocation failure. This is just for optimization, + * if we fail to allocate here, we will try again later if needed. + */ + ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0); +} + +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + btrfs_assert_inode_locked(ctx->inode); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + + static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { @@ -2867,10 +2932,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, } /* - * btrfs_sync_log does sends a given tree log down to the disk and - * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk only - * if it returns 0. + * Sends a given tree log down to the disk and updates the super blocks to + * record it. When this call is done, you know that any inodes previously + * logged are safely on disk only if it returns 0. * * Any other return value means you need to call btrfs_commit_transaction. * Some of the edge cases for fsyncing directories that have had unlinks @@ -2980,7 +3044,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); - root->log_transid++; + btrfs_set_root_log_transid(root, root->log_transid + 1); log->log_transid = root->log_transid; root->log_start_pid = 0; /* @@ -3018,15 +3082,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = update_log_root(trans, log, &new_root_item); if (ret) { - if (!list_empty(&root_log_ctx.list)) - list_del_init(&root_log_ctx.list); - + list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); btrfs_set_log_full_commit(trans); if (ret != -ENOSPC) btrfs_err(fs_info, "failed to update log for root %llu ret %d", - root->root_key.objectid, ret); + btrfs_root_id(root), ret); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); goto out; @@ -3040,7 +3102,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); ret = btrfs_wait_tree_log_extents(log, mark); @@ -3155,8 +3216,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * someone else already started it. We use <= and not < because the * first log transaction has an ID of 0. 
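btrfs_init_log_ctx_scratch_eb() above is an opportunistic preallocation: failure is tolerated because clone_leaf() (further down) falls back to btrfs_clone_extent_buffer() on demand, and the extra reference clone_leaf() takes on ctx->scratch_eb is what lets the buffer outlive btrfs_release_path(). As a reference-count ledger (a sketch of the intended lifetime, not verbatim code):

    /* clone_leaf():          refs = 2 (ctx->scratch_eb + path->nodes[0])
     * btrfs_release_path():  refs = 1 (ctx->scratch_eb only)
     * next clone_leaf():     buffer reused, no new allocation needed   */
    atomic_inc(&ctx->scratch_eb->refs);	/* pairs with the path's put */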
*/ - ASSERT(root->last_log_commit <= log_transid); - root->last_log_commit = log_transid; + ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); + btrfs_set_root_last_log_commit(root, log_transid); out_wake_log_root: mutex_lock(&log_root_tree->log_mutex); @@ -3230,8 +3291,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, } } - clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + extent_io_tree_release(&log->dirty_log_pages); extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); @@ -3646,6 +3706,30 @@ out: return ret; } +static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) +{ + const int slot = path->slots[0]; + + if (ctx->scratch_eb) { + copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]); + } else { + ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!ctx->scratch_eb) + return -ENOMEM; + } + + btrfs_release_path(path); + path->nodes[0] = ctx->scratch_eb; + path->slots[0] = slot; + /* + * Add extra ref to scratch eb so that it is not freed when callers + * release the path, so we can reuse it later if needed. + */ + atomic_inc(&ctx->scratch_eb->refs); + + return 0; +} + static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, @@ -3660,23 +3744,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, bool last_found = false; int batch_start = 0; int batch_size = 0; - int i; + int ret; /* * We need to clone the leaf, release the read lock on it, and use the * clone before modifying the log tree. See the comment at copy_items() * about why we need to do this. */ - src = btrfs_clone_extent_buffer(path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(path, ctx); + if (ret < 0) + return ret; - i = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = src; - path->slots[0] = i; + src = path->nodes[0]; - for (; i < nritems; i++) { + for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4157,19 +4238,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime(inode).tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime(inode).tv_nsec); + inode_get_ctime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4194,8 +4278,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool inode_item_dropped) { struct btrfs_inode_item *inode_item; + struct btrfs_key key; int ret; + btrfs_get_inode_key(inode, &key); /* * If we are doing a fast fsync and the inode was logged before in the * current transaction, then we know the inode was previously logged and @@ -4207,7 +4293,7 @@ static 
int log_inode_item(struct btrfs_trans_handle *trans, * already exists can also result in unnecessarily splitting a leaf. */ if (!inode_item_dropped && inode->logged_trans == trans->transid) { - ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); ASSERT(ret <= 0); if (ret > 0) ret = -ENOENT; @@ -4221,7 +4307,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime * flags and set ->logged_trans to 0. */ - ret = btrfs_insert_empty_item(trans, log, path, &inode->location, + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*inode_item)); ASSERT(ret != -EEXIST); } @@ -4286,17 +4372,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct btrfs_path *src_path, int start_slot, int nr, int inode_only, - u64 logged_isize) + u64 logged_isize, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; struct extent_buffer *src; - int ret = 0; + int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; - int i; int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4329,14 +4414,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * while the other is holding the delayed node's mutex and wants to * write lock the same subvolume leaf for flushing delayed items. */ - src = btrfs_clone_extent_buffer(src_path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(src_path, ctx); + if (ret < 0) + return ret; - i = src_path->slots[0]; - btrfs_release_path(src_path); - src_path->nodes[0] = src; - src_path->slots[0] = i; + src = src_path->nodes[0]; ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4351,7 +4433,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.nr = 0; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; struct btrfs_root *csum_root; struct btrfs_ordered_sum *sums; @@ -4426,9 +4508,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, disk_bytenr += extent_offset; ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0, false); - if (ret) + &ordered_sums, false); + if (ret < 0) goto out; + ret = 0; list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { if (!ret) @@ -4458,7 +4541,7 @@ add_to_batch: goto out; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; const int dst_slot = dst_path->slots[0] + dst_index; struct btrfs_key key; @@ -4538,16 +4621,17 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, { struct btrfs_ordered_extent *ordered; struct btrfs_root *csum_root; + u64 block_start; u64 csum_offset; u64 csum_len; - u64 mod_start = em->mod_start; - u64 mod_len = em->mod_len; + u64 mod_start = em->start; + u64 mod_len = em->len; LIST_HEAD(ordered_sums); int ret = 0; if (inode->flags & BTRFS_INODE_NODATASUM || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || - em->block_start == EXTENT_MAP_HOLE) + (em->flags & EXTENT_FLAG_PREALLOC) || + em->disk_bytenr == EXTENT_MAP_HOLE) return 0; list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { @@ -4609,21 +4693,23 @@ static int log_extent_csums(struct btrfs_trans_handle 
*trans, return 0; /* If we're compressed we have to save the entire range of csums. */ - if (em->compress_type) { + if (extent_map_is_compressed(em)) { csum_offset = 0; - csum_len = max(em->block_len, em->orig_block_len); + csum_len = em->disk_num_bytes; } else { csum_offset = mod_start - em->start; csum_len = mod_len; } /* block start is already adjusted for the file extent offset. */ - csum_root = btrfs_csum_root(trans->fs_info, em->block_start); - ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, - em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0, false); - if (ret) + block_start = extent_map_block_start(em); + csum_root = btrfs_csum_root(trans->fs_info, block_start); + ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset, + block_start + csum_offset + csum_len - 1, + &ordered_sums, false); + if (ret < 0) return ret; + ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, @@ -4649,30 +4735,32 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item fi = { 0 }; struct extent_buffer *leaf; struct btrfs_key key; - u64 extent_offset = em->start - em->orig_start; + enum btrfs_compression_type compress_type; + u64 extent_offset = em->offset; + u64 block_start = extent_map_block_start(em); u64 block_len; int ret; btrfs_set_stack_file_extent_generation(&fi, trans->transid); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); else btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); - block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); + block_len = em->disk_num_bytes; + compress_type = extent_map_compression(em); + if (compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - - extent_offset); + } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); } btrfs_set_stack_file_extent_offset(&fi, extent_offset); btrfs_set_stack_file_extent_num_bytes(&fi, em->len); btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); - btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + btrfs_set_stack_file_extent_compression(&fi, compress_type); ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) @@ -4729,7 +4817,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, */ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_key key; @@ -4795,7 +4884,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) goto out; ins_nr = 0; @@ -4850,7 +4939,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path, - 
start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); btrfs_free_path(dst_path); @@ -4890,13 +4979,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, continue; /* We log prealloc extents beyond eof later. */ - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + if ((em->flags & EXTENT_FLAG_PREALLOC) && em->start >= i_size_read(&inode->vfs_inode)) continue; /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); - set_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags |= EXTENT_FLAG_LOGGING; list_add_tail(&em->list, &extents); num++; } @@ -4913,7 +5002,7 @@ process: * private list. */ if (ret) { - clear_em_logging(tree, em); + clear_em_logging(inode, em); free_extent_map(em); continue; } @@ -4922,14 +5011,14 @@ process: ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); - clear_em_logging(tree, em); + clear_em_logging(inode, em); free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); if (!ret) - ret = btrfs_log_prealloc_extents(trans, inode, path); + ret = btrfs_log_prealloc_extents(trans, inode, path, ctx); if (ret) return ret; @@ -4945,12 +5034,12 @@ process: set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - spin_lock_irq(&inode->ordered_tree.lock); + spin_lock_irq(&inode->ordered_tree_lock); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); atomic_inc(&trans->transaction->pending_ordered); } - spin_unlock_irq(&inode->ordered_tree.lock); + spin_unlock_irq(&inode->ordered_tree_lock); } btrfs_put_ordered_extent(ordered); } @@ -5010,7 +5099,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; int ret; @@ -5039,7 +5129,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5065,7 +5155,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; } @@ -5419,7 +5509,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ihold(&curr_inode->vfs_inode); while (true) { - struct inode *vfs_inode; struct btrfs_key key; struct btrfs_key found_key; u64 next_index; @@ -5435,7 +5524,7 @@ again: struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; - struct inode *di_inode; + struct btrfs_inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; @@ -5462,17 +5551,16 @@ again: goto out; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); + 
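The csum handling in log_extent_csums() above reduces to a two-way choice: a compressed extent is always read back as a whole, so every checksum of the on-disk extent goes into the log, while a regular extent only needs the checksums covering the modified subrange. A minimal userspace sketch of that selection follows; the struct and its fields are illustrative stand-ins for the kernel's extent_map, not its real layout.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct extent_map_model {
    uint64_t start;          /* file offset the extent starts at */
    uint64_t disk_num_bytes; /* size of the extent on disk */
    bool     compressed;
};

/* Pick the csum range to copy into the log for a modified range. */
static void csum_range(const struct extent_map_model *em,
                       uint64_t mod_start, uint64_t mod_len,
                       uint64_t *csum_offset, uint64_t *csum_len)
{
    if (em->compressed) {
        /* Compressed extents are read as a whole, so log every csum. */
        *csum_offset = 0;
        *csum_len = em->disk_num_bytes;
    } else {
        /* Only the csums covering the modified subrange are needed. */
        *csum_offset = mod_start - em->start;
        *csum_len = mod_len;
    }
}

int main(void)
{
    struct extent_map_model em = { .start = 4096, .disk_num_bytes = 65536,
                                   .compressed = false };
    uint64_t off, len;

    csum_range(&em, 8192, 4096, &off, &len);
    printf("csum offset %llu len %llu\n",
           (unsigned long long)off, (unsigned long long)len);
    return 0;
}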
btrfs_add_delayed_iput(di_inode); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5514,14 +5602,13 @@ again: kfree(dir_elem); btrfs_add_delayed_iput(curr_inode); - curr_inode = NULL; - vfs_inode = btrfs_iget_logging(ino, root); - if (IS_ERR(vfs_inode)) { - ret = PTR_ERR(vfs_inode); + curr_inode = btrfs_iget_logging(ino, root); + if (IS_ERR(curr_inode)) { + ret = PTR_ERR(curr_inode); + curr_inode = NULL; break; } - curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); @@ -5599,7 +5686,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - struct inode *inode; + struct btrfs_inode *inode; /* * It's rare to have a lot of conflicting inodes, in practice it is not @@ -5690,12 +5777,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * inode in LOG_INODE_EXISTS mode and rename operations update the log, * so that the log ends up with the new name and without the old name. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); return 0; } - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5731,7 +5818,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ while (!list_empty(&ctx->conflict_inodes)) { struct btrfs_ino_list *curr; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; u64 parent; @@ -5767,9 +5854,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * dir index key range logged for the directory. So we * must make sure the deletion is recorded. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; continue; @@ -5785,8 +5871,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * it again because if some other task logged the inode after * that, we can avoid doing it again. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); continue; } @@ -5797,8 +5883,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. 
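The flag hunks above replace test_bit()/set_bit() with plain & and |= because extent map flags are now an ordinary integer bitmask, mutated only while the extent tree lock is held, so atomic bitops are unnecessary. A small sketch of that style, with made-up flag names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXTENT_FLAG_PREALLOC_MODEL (1U << 0)
#define EXTENT_FLAG_LOGGING_MODEL  (1U << 1)

struct em_model {
    uint32_t flags;    /* plain bitmask, guarded by the tree lock */
};

int main(void)
{
    struct em_model em = { .flags = EXTENT_FLAG_PREALLOC_MODEL };

    /* set_bit(EXTENT_FLAG_LOGGING, ...) becomes a plain OR */
    em.flags |= EXTENT_FLAG_LOGGING_MODEL;

    /* test_bit(EXTENT_FLAG_PREALLOC, ...) becomes a plain AND */
    printf("prealloc=%d logging=%d\n",
           !!(em.flags & EXTENT_FLAG_PREALLOC_MODEL),
           !!(em.flags & EXTENT_FLAG_LOGGING_MODEL));
    return 0;
}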
*/ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; } @@ -5866,7 +5952,7 @@ again: if (ret < 0) { return ret; } else if (ret > 0 && - other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { + other_ino != btrfs_ino(ctx->inode)) { if (ins_nr > 0) { ins_nr++; } else { @@ -5875,7 +5961,7 @@ again: } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, - inode_only, logged_isize); + inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5894,7 +5980,7 @@ again: goto next_slot; ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5911,7 +5997,7 @@ again: } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 1; @@ -5926,7 +6012,7 @@ next_slot: if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, - logged_isize); + logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5951,7 +6037,7 @@ next_key: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret) return ret; } @@ -5962,7 +6048,7 @@ next_key: * lock the same leaf with btrfs_log_prealloc_extents() below. */ btrfs_release_path(path); - ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx); } return ret; @@ -6290,7 +6376,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, list_for_each_entry(item, delayed_ins_list, log_list) { struct btrfs_dir_item *dir_item; - struct inode *di_inode; + struct btrfs_inode *di_inode; struct btrfs_key key; int log_mode = LOG_INODE_EXISTS; @@ -6306,8 +6392,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, break; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); continue; } @@ -6315,12 +6401,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + ret = log_new_dir_dentries(trans, di_inode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + btrfs_add_delayed_iput(di_inode); if (ret) break; @@ -6553,7 +6639,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; xattrs_logged = true; @@ -6580,7 +6666,7 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. 
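The directory walk above switches btrfs_iget_logging() to returning a btrfs_inode and, on failure, resets the cursor to NULL so the shared exit path can release it unconditionally. A userspace model of that pattern, with simplified stand-ins for the kernel's ERR_PTR/IS_ERR/PTR_ERR pointer encoding:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static void *ERR_PTR_model(long err) { return (void *)err; }
static long PTR_ERR_model(const void *p) { return (long)p; }
static int IS_ERR_model(const void *p)
{
    return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

struct inode_model { unsigned long ino; };

static struct inode_model *iget_model(unsigned long ino)
{
    if (ino == 0)                      /* model a lookup failure */
        return ERR_PTR_model(-ENOENT);
    struct inode_model *in = malloc(sizeof(*in));
    if (!in)
        return ERR_PTR_model(-ENOMEM);
    in->ino = ino;
    return in;
}

int main(void)
{
    struct inode_model *curr = iget_model(0);
    int ret = 0;

    if (IS_ERR_model(curr)) {
        ret = (int)PTR_ERR_model(curr);
        curr = NULL;     /* keep the cleanup path unconditional */
    }
    free(curr);          /* safe: free(NULL) is a no-op */
    printf("ret = %d\n", ret);
    return 0;
}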
*/ if (!xattrs_logged && inode->logged_trans < trans->transid) { - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; btrfs_release_path(path); @@ -6728,7 +6814,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; - struct inode *dir_inode; + struct btrfs_inode *dir_inode; inode_key.type = BTRFS_INODE_ITEM_KEY; inode_key.offset = 0; @@ -6777,18 +6863,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } - if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (!need_log_inode(trans, dir_inode)) { + btrfs_add_delayed_iput(dir_inode); continue; } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), - LOG_INODE_ALL, ctx); + ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, - BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + ret = log_new_dir_dentries(trans, dir_inode, ctx); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -6813,7 +6897,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; struct btrfs_key search_key; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; int ret = 0; @@ -6828,11 +6912,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid && - need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -7013,6 +7096,15 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, } /* + * If we're logging an inode from a subvolume created in the current + * transaction we must force a commit since the root is not persisted. + */ + if (btrfs_root_generation(&root->root_item) == trans->transid) { + ret = BTRFS_LOG_FORCE_COMMIT; + goto end_no_trans; + } + + /* * Skip already logged inodes or inodes corresponding to tmpfiles * (since logging them is pointless, a link count of 0 means they * will never be accessible). @@ -7222,9 +7314,7 @@ again: * each subsequent pass. */ if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(trans, - log->node->start, - log->node->len); + ret = btrfs_pin_extent_for_log_replay(trans, log->node); btrfs_put_root(log); if (!ret) @@ -7235,11 +7325,14 @@ again: wc.replay_dest->log_root = log; ret = btrfs_record_root_in_trans(trans, wc.replay_dest); - if (ret) + if (ret) { /* The loop needs to continue due to the root refs */ btrfs_abort_transaction(trans, ret); - else + } else { ret = walk_log_tree(trans, log, &wc); + if (ret) + btrfs_abort_transaction(trans, ret); + } if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, @@ -7395,6 +7488,26 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, } /* + * Call this when creating a subvolume in a directory. 
+ * Because we don't commit a transaction when creating a subvolume, we can't + * allow the directory pointing to the subvolume to be logged with an entry that + * points to an unpersisted root if we are still in the transaction used to + * create the subvolume, so we make any attempt to log the directory result in a + * full log sync. + * Also we don't need to worry about renames, since btrfs_rename() marks the log + * for full commit when renaming a subvolume. + * + * Must be called before creating the subvolume entry in its parent directory. + */ +void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, + struct btrfs_inode *dir) +{ + mutex_lock(&dir->log_mutex); + dir->last_unlink_trans = trans->transid; + mutex_unlock(&dir->log_mutex); +} + +/* * Update the log after adding a new name for an inode. * * @trans: Transaction handle. @@ -7526,8 +7639,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, goto out; } - btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + btrfs_init_log_ctx(&ctx, inode); ctx.logging_new_name = true; + btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7536,6 +7650,7 @@ * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a550a8a375cd..dc313e6bb2fa 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,10 +6,18 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include <linux/list.h> +#include <linux/fs.h> #include "messages.h" #include "ctree.h" #include "transaction.h" +struct inode; +struct dentry; +struct btrfs_ordered_extent; +struct btrfs_root; +struct btrfs_trans_handle; + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 @@ -29,44 +37,27 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - struct inode *inode; + struct btrfs_inode *inode; struct list_head list; /* Only used for fast fsyncs. */ struct list_head ordered_extents; struct list_head conflict_inodes; int num_conflict_inodes; bool logging_conflict_inodes; + /* + * Used for fsyncs that need to copy items from the subvolume tree to + * the log tree (full sync flag set or copy everything flag set) to + * avoid allocating a temporary extent buffer while holding a lock on + * an extent buffer of the subvolume tree and under the log transaction. + * Also helps to avoid allocating and freeing a temporary extent buffer + * in case we need to process multiple leaves from the subvolume tree.
+ */ + struct extent_buffer *scratch_eb; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, - struct inode *inode) -{ - ctx->log_ret = 0; - ctx->log_transid = 0; - ctx->log_new_dentries = false; - ctx->logging_new_name = false; - ctx->logging_new_delayed_dentries = false; - ctx->logged_before = false; - ctx->inode = inode; - INIT_LIST_HEAD(&ctx->list); - INIT_LIST_HEAD(&ctx->ordered_extents); - INIT_LIST_HEAD(&ctx->conflict_inodes); - ctx->num_conflict_inodes = 0; - ctx->logging_conflict_inodes = false; -} - -static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) -{ - struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_extent *tmp; - - ASSERT(inode_is_locked(ctx->inode)); - - list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { - list_del_init(&ordered->log_list); - btrfs_put_ordered_extent(ordered); - } -} +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode); +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx); +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx); static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) { @@ -103,6 +94,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, bool for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); +void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, + struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct dentry *old_dentry, struct btrfs_inode *old_dir, u64 old_dir_index, struct dentry *parent); diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 3df6153d5d5a..b382a4c443d4 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -44,7 +44,7 @@ struct tree_mod_elem { /* * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } @@ -170,8 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, * this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,8 +187,8 @@ static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, } /* Similar to tree_mod_dont_log, but doesn't acquire any locks. 
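The scratch_eb member documented above is allocated once next to the log context, reused for every subvolume leaf that has to be copied, and freed a single time when logging ends (btrfs_init_log_ctx_scratch_eb() and free_extent_buffer() in the hunks). A userspace sketch of that lifecycle, with malloc/free standing in for the extent buffer calls:

#include <stdlib.h>
#include <string.h>

struct log_ctx_model {
    void *scratch;          /* reused across all copy calls */
    size_t scratch_size;
};

static int copy_one_leaf(struct log_ctx_model *ctx, const void *leaf, size_t len)
{
    if (!ctx->scratch)      /* allocated once up front, not per leaf */
        return -1;
    memcpy(ctx->scratch, leaf,
           len < ctx->scratch_size ? len : ctx->scratch_size);
    return 0;
}

int main(void)
{
    struct log_ctx_model ctx = { .scratch_size = 16384 };
    char leaf[16384] = { 0 };

    ctx.scratch = malloc(ctx.scratch_size);   /* init_log_ctx_scratch_eb */
    if (!ctx.scratch)
        return 1;
    for (int i = 0; i < 3; i++)               /* several leaves, one buffer */
        copy_one_leaf(&ctx, leaf, sizeof(leaf));
    free(ctx.scratch);                        /* free_extent_buffer */
    return 0;
}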
*/ -static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; @@ -199,7 +198,7 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, return true; } -static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, +static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { @@ -222,7 +221,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; @@ -259,7 +258,7 @@ out_unlock: return ret; } -static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, +static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -279,7 +278,7 @@ static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -367,9 +366,9 @@ free_tms: return ret; } -static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct tree_mod_elem **tm_list, - int nritems) +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { int i, j; int ret; @@ -536,7 +535,7 @@ static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, } int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) @@ -1005,7 +1004,7 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); check.level = level; - check.owner_root = root->root_key.objectid; + check.owner_root = btrfs_root_id(root); old = read_tree_block(fs_info, logical, &check); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 94f10afeee97..6308c577a4a4 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -3,7 +3,13 @@ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H -#include "ctree.h" +#include <linux/list.h> + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_root; +struct btrfs_seq_list; /* Represents a tree mod log user. 
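btrfs_inc_tree_mod_seq() above hands out sequence numbers by bumping one shared atomic counter, so every recorded tree modification gets a strictly increasing stamp. A userspace model using C11 atomics in place of atomic64_inc_return():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t tree_mod_seq;

static uint64_t inc_tree_mod_seq(void)
{
    /* fetch_add returns the old value, so add 1 to mimic inc_return */
    return atomic_fetch_add(&tree_mod_seq, 1) + 1;
}

int main(void)
{
    printf("%llu\n", (unsigned long long)inc_tree_mod_seq()); /* 1 */
    printf("%llu\n", (unsigned long long)inc_tree_mod_seq()); /* 2 */
    return 0;
}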
*/ struct btrfs_seq_list { @@ -31,7 +37,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, @@ -41,11 +47,11 @@ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items); -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items); u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 33606025513d..fc59b57257d6 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -7,7 +7,6 @@ #include <linux/slab.h> #include "messages.h" #include "ulist.h" -#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 @@ -51,6 +50,7 @@ void ulist_init(struct ulist *ulist) INIT_LIST_HEAD(&ulist->nodes); ulist->root = RB_ROOT; ulist->nnodes = 0; + ulist->prealloc = NULL; } /* @@ -69,6 +69,8 @@ void ulist_release(struct ulist *ulist) list_for_each_entry_safe(node, next, &ulist->nodes, list) { kfree(node); } + kfree(ulist->prealloc); + ulist->prealloc = NULL; ulist->root = RB_ROOT; INIT_LIST_HEAD(&ulist->nodes); } @@ -106,6 +108,12 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } +void ulist_prealloc(struct ulist *ulist, gfp_t gfp_mask) +{ + if (!ulist->prealloc) + ulist->prealloc = kzalloc(sizeof(*ulist->prealloc), gfp_mask); +} + /* * Free dynamically allocated ulist. * @@ -207,9 +215,15 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, *old_aux = node->aux; return 0; } - node = kmalloc(sizeof(*node), gfp_mask); - if (!node) - return -ENOMEM; + + if (ulist->prealloc) { + node = ulist->prealloc; + ulist->prealloc = NULL; + } else { + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; + } node->val = val; node->aux = aux; @@ -223,7 +237,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, } /* - * ulist_del - delete one node from ulist + * Delete one node from ulist. 
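The ulist changes above add a preallocation hook: ulist_prealloc() stashes a spare node while a sleeping allocation is still safe, and ulist_add_merge() consumes it before falling back to a fresh allocation. A userspace model of that consume-or-allocate flow, with simplified names and a fixed array in place of the rb-tree:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct unode_model { uint64_t val; uint64_t aux; };

struct ulist_model {
    struct unode_model *prealloc;   /* stashed by the prealloc call */
    struct unode_model *nodes[8];   /* stands in for the rb-tree/list */
    int nnodes;
};

/* Allocate a spare node while a sleeping allocation is still safe. */
static void ulist_prealloc_model(struct ulist_model *u)
{
    if (!u->prealloc)
        u->prealloc = calloc(1, sizeof(*u->prealloc));
}

/* Add path: consume the spare node first, else allocate on the spot. */
static int ulist_add_model(struct ulist_model *u, uint64_t val, uint64_t aux)
{
    struct unode_model *node;

    if (u->nnodes == 8)
        return -1;
    if (u->prealloc) {
        node = u->prealloc;
        u->prealloc = NULL;
    } else {
        node = malloc(sizeof(*node));
        if (!node)
            return -1;              /* -ENOMEM in the kernel */
    }
    node->val = val;
    node->aux = aux;
    u->nodes[u->nnodes++] = node;
    return 0;
}

int main(void)
{
    struct ulist_model u = { 0 };

    ulist_prealloc_model(&u);       /* outside the critical section */
    ulist_add_model(&u, 42, 0);     /* consumes the preallocated node */
    ulist_add_model(&u, 43, 0);     /* falls back to malloc */
    printf("nnodes=%d\n", u.nnodes);
    for (int i = 0; i < u.nnodes; i++)
        free(u.nodes[i]);
    return 0;
}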
+ * * @ulist: ulist to remove node from * @val: value to delete * @aux: aux to delete diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index b2cef187ea8e..c62a372f1462 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -7,6 +7,7 @@ #ifndef BTRFS_ULIST_H #define BTRFS_ULIST_H +#include <linux/types.h> #include <linux/list.h> #include <linux/rbtree.h> @@ -40,12 +41,14 @@ struct ulist { struct list_head nodes; struct rb_root root; + struct ulist_node *prealloc; }; void ulist_init(struct ulist *ulist); void ulist_release(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); +void ulist_prealloc(struct ulist *ulist, gfp_t mask); void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 5be74f9e47eb..aca2861f2187 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -3,18 +3,19 @@ * Copyright (C) STRATO AG 2013. All rights reserved. */ +#include <linux/kthread.h> #include <linux/uuid.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" -static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) +static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key) { key->type = type; key->objectid = get_unaligned_le64(uuid); @@ -22,7 +23,7 @@ static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) } /* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */ -static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, +static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, u8 type, u64 subid) { int ret; @@ -82,7 +83,7 @@ out: return ret; } -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid_cpu) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -114,7 +115,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); - if (ret >= 0) { + if (ret == 0) { /* Add an item for the type for the first time */ eb = path->nodes[0]; slot = path->slots[0]; @@ -146,7 +147,7 @@ out: return ret; } -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -257,7 +258,7 @@ out: * < 0 if an error occurred */ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, - u8 *uuid, u8 type, u64 subvolid) + const u8 *uuid, u8 type, u64 subvolid) { int ret = 0; struct btrfs_root *subvol_root; @@ -391,3 +392,180 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + bool closing = false; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + 
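btrfs_uuid_to_key() above packs the 16-byte UUID into the two u64 halves of a key (objectid and offset, both read little-endian), which turns UUID lookups into ordinary key searches. A sketch of that packing; the byte-wise helper mirrors what get_unaligned_le64() does for arbitrary alignment, and the key type value is only an example:

#include <stdint.h>
#include <stdio.h>

static uint64_t get_unaligned_le64_model(const uint8_t *p)
{
    uint64_t v = 0;
    for (int i = 7; i >= 0; i--)
        v = (v << 8) | p[i];        /* p[0] is the least significant byte */
    return v;
}

struct key_model { uint64_t objectid; uint8_t type; uint64_t offset; };

static void uuid_to_key_model(const uint8_t uuid[16], uint8_t type,
                              struct key_model *key)
{
    key->type = type;
    key->objectid = get_unaligned_le64_model(uuid);
    key->offset = get_unaligned_le64_model(uuid + 8);
}

int main(void)
{
    uint8_t uuid[16];
    struct key_model key;

    for (int i = 0; i < 16; i++)
        uuid[i] = (uint8_t)i;
    uuid_to_key_model(uuid, 251 /* e.g. a subvol UUID key type */, &key);
    printf("objectid=%#llx offset=%#llx\n",
           (unsigned long long)key.objectid,
           (unsigned long long)key.offset);
    return 0;
}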
key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + btrfs_release_path(path); + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + btrfs_release_path(path); + if (trans) { + ret = btrfs_end_transaction(trans); + trans = NULL; + if (ret) + break; + } + + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else if (!closing) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error cases */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index 5350c87fe2ca..c60ad20325cc 100644 ---
a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -3,10 +3,17 @@ #ifndef BTRFS_UUID_TREE_H #define BTRFS_UUID_TREE_H -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_fs_info; + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid); -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); #endif diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 744f4f4d4c68..e97ad824ae16 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -14,7 +14,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" -#include "disk-io.h" #include "locking.h" #include "fs.h" #include "accessors.h" @@ -285,7 +284,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * page and ignore dest, but it must still be non-NULL to avoid the * counting-only behavior. * @len: length in bytes to read - * @dest_page: copy into this page instead of the dest buffer + * @dest_folio: copy into this folio instead of the dest buffer * * Helper function to read items from the btree. This returns the number of * bytes read or < 0 for errors. We can return short reads if the items don't @@ -295,7 +294,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * Returns number of bytes read or a negative error code on failure. */ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, - char *dest, u64 len, struct page *dest_page) + char *dest, u64 len, struct folio *dest_folio) { struct btrfs_path *path; struct btrfs_root *root = inode->root; @@ -315,7 +314,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (!path) return -ENOMEM; - if (dest_page) + if (dest_folio) path->reada = READA_FORWARD; key.objectid = btrfs_ino(inode); @@ -372,15 +371,15 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, copy_offset = offset - key.offset; if (dest) { - if (dest_page) - kaddr = kmap_local_page(dest_page); + if (dest_folio) + kaddr = kmap_local_folio(dest_folio, 0); data = btrfs_item_ptr(leaf, path->slots[0], void); read_extent_buffer(leaf, kaddr + dest_offset, (unsigned long)data + copy_offset, copy_bytes); - if (dest_page) + if (dest_folio) kunmap_local(kaddr); } @@ -461,7 +460,7 @@ static int rollback_verity(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; int ret; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); ret = btrfs_drop_verity_items(inode); @@ -487,7 +486,7 @@ static int rollback_verity(struct btrfs_inode *inode) } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -554,7 +553,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, } inode->ro_flags |= BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); - ret = 
btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; ret = del_orphan(trans, inode); @@ -586,7 +585,7 @@ static int btrfs_begin_enable_verity(struct file *filp) struct btrfs_trans_handle *trans; int ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; @@ -634,7 +633,7 @@ static int btrfs_end_enable_verity(struct file *filp, const void *desc, int ret = 0; int rollback_ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (desc == NULL) goto rollback; @@ -763,7 +762,7 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - folio_address(folio), PAGE_SIZE, &folio->page); + folio_address(folio), PAGE_SIZE, folio); if (ret < 0) { folio_put(folio); return ERR_PTR(ret); diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h index 91c10f7d0a46..d696659e43e4 100644 --- a/fs/btrfs/verity.h +++ b/fs/btrfs/verity.h @@ -3,8 +3,13 @@ #ifndef BTRFS_VERITY_H #define BTRFS_VERITY_H +struct inode; +struct btrfs_inode; + #ifdef CONFIG_FS_VERITY +#include <linux/fsverity.h> + extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); @@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) #else +#include <linux/errno.h> + static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) { return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b9a0b26d08e1..58e0cac5779d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,10 +14,8 @@ #include <linux/namei.h> #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" @@ -35,11 +33,23 @@ #include "relocation.h" #include "scrub.h" #include "super.h" +#include "raid-stripe-tree.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) +struct btrfs_io_geometry { + u32 stripe_index; + u32 stripe_nr; + int mirror_num; + int num_stripes; + u64 stripe_offset; + u64 raid56_full_stripe_start; + int max_errors; + enum btrfs_map_op op; +}; + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, @@ -357,21 +367,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) } /* - * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the UUID to fs_devices::fsid - * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid + * Allocate new btrfs_fs_devices structure identified by a fsid. + * + * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to + * fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. 
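read_key_bytes() in the verity code above returns the number of bytes it managed to copy, which may legitimately be short when the stored items end before the requested length; only negative values are errors. A userspace model of that contract:

#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static long read_range_model(const char *items, size_t items_len,
                             size_t offset, char *dest, size_t len)
{
    if (!dest)
        return -EINVAL;
    if (offset >= items_len)
        return 0;                       /* nothing stored past offset */
    if (len > items_len - offset)
        len = items_len - offset;       /* short read, not an error */
    memcpy(dest, items + offset, len);
    return (long)len;
}

int main(void)
{
    char store[8] = "merkle!";
    char buf[16];
    long got = read_range_model(store, sizeof(store), 4, buf, sizeof(buf));

    printf("read %ld bytes\n", got);    /* 4: a short read */
    return 0;
}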
*/ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, - const u8 *metadata_fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { struct btrfs_fs_devices *fs_devs; - ASSERT(fsid || !metadata_fsid); - fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); @@ -385,8 +393,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, if (fsid) { memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); - memcpy(fs_devs->metadata_uuid, - metadata_fsid ?: fsid, BTRFS_FSID_SIZE); + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); } return fs_devs; @@ -457,91 +464,46 @@ static noinline struct btrfs_fs_devices *find_fsid( return NULL; } -/* - * First check if the metadata_uuid is different from the fsid in the given - * fs_devices. Then check if the given fsid is the same as the metadata_uuid - * in the fs_devices. If it is, return true; otherwise, return false. - */ -static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, - const u8 *fsid) -{ - return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; -} - -static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( - struct btrfs_super_block *disk_super) -{ - - struct btrfs_fs_devices *fs_devices; - - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by first scanning - * a device which didn't have its fsid/metadata_uuid changed - * at all and the CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, - fs_devices->fsid)) - return fs_devices; - } - - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by a device that - * has an outdated pair of fsid/metadata_uuid and - * CHANGING_FSID_V2 flag set. 
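btrfs_get_bdev_and_sb() above follows a strict acquire-and-unwind sequence: a failure at any later step releases what earlier steps acquired (fput() on the opened file) and resets the output pointers so callers never see half-initialized state. A userspace model with fakes that can be told to fail:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct file_model { int id; };
struct sb_model { int magic; };

/* Fakes standing in for the open and read-super steps. */
static struct file_model *open_dev_model(int should_fail)
{
    return should_fail ? NULL : calloc(1, sizeof(struct file_model));
}

static struct sb_model *read_super_model(int should_fail)
{
    return should_fail ? NULL : calloc(1, sizeof(struct sb_model));
}

static int get_bdev_and_sb_model(int fail_open, int fail_super,
                                 struct file_model **file_out,
                                 struct sb_model **sb_out)
{
    *file_out = open_dev_model(fail_open);
    if (!*file_out)
        goto error;

    *sb_out = read_super_model(fail_super);
    if (!*sb_out) {
        free(*file_out);        /* fput(*bdev_file) in the kernel path */
        goto error;
    }
    return 0;

error:
    *file_out = NULL;           /* never hand back stale pointers */
    *sb_out = NULL;
    return -EINVAL;
}

int main(void)
{
    struct file_model *f;
    struct sb_model *sb;

    printf("%d\n", get_bdev_and_sb_model(0, 1, &f, &sb)); /* -22, both NULL */
    printf("%d\n", get_bdev_and_sb_model(0, 0, &f, &sb)); /* 0 */
    free(f);
    free(sb);
    return 0;
}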
- */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) - return fs_devices; - } - - return find_fsid(disk_super->fsid, disk_super->metadata_uuid); -} - - static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct block_device **bdev, + int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { + struct block_device *bdev; int ret; - *bdev = blkdev_get_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev)) { - ret = PTR_ERR(*bdev); + if (IS_ERR(*bdev_file)) { + ret = PTR_ERR(*bdev_file); + btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", + device_path, flags, ret); goto error; } + bdev = file_bdev(*bdev_file); if (flush) - sync_blockdev(*bdev); - ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); - if (ret) { - blkdev_put(*bdev, holder); - goto error; + sync_blockdev(bdev); + if (holder) { + ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); + if (ret) { + fput(*bdev_file); + goto error; + } } - invalidate_bdev(*bdev); - *disk_super = btrfs_read_dev_super(*bdev); + invalidate_bdev(bdev); + *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - blkdev_put(*bdev, holder); + fput(*bdev_file); goto error; } return 0; error: - *bdev = NULL; + *disk_super = NULL; + *bdev_file = NULL; return ret; } @@ -562,13 +524,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; - int ret = 0; + int ret; + bool freed = false; lockdep_assert_held(&uuid_mutex); - if (devt) - ret = -ENOENT; - + /* Return good status if there is no instance of devt. */ + ret = 0; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { mutex_lock(&fs_devices->device_list_mutex); @@ -579,8 +541,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device if (devt && devt != device->devt) continue; if (fs_devices->opened) { - /* for an already deleted device return 0 */ - if (devt && ret != 0) + if (devt) ret = -EBUSY; break; } @@ -590,7 +551,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device list_del(&device->dev_list); btrfs_free_device(device); - ret = 0; + freed = true; } mutex_unlock(&fs_devices->device_list_mutex); @@ -601,9 +562,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device } } + /* If there is at least one freed device return 0. */ + if (freed) + return 0; + return ret; } +static struct btrfs_fs_devices *find_fsid_by_device( + struct btrfs_super_block *disk_super, + dev_t devt, bool *same_fsid_diff_dev) +{ + struct btrfs_fs_devices *fsid_fs_devices; + struct btrfs_fs_devices *devt_fs_devices; + const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool found_by_devt = false; + + /* Find the fs_device by the usual method, if found use it. */ + fsid_fs_devices = find_fsid(disk_super->fsid, + has_metadata_uuid ? disk_super->metadata_uuid : NULL); + + /* The temp_fsid feature is supported only with single device filesystem. 
*/ + if (btrfs_super_num_devices(disk_super) != 1) + return fsid_fs_devices; + + /* + * A seed device is an integral component of the sprout device, which + * functions as a multi-device filesystem. So, the temp-fsid feature is + * not supported. + */ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) + return fsid_fs_devices; + + /* Try to find a fs_devices by matching devt. */ + list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) { + struct btrfs_device *device; + + list_for_each_entry(device, &devt_fs_devices->devices, dev_list) { + if (device->devt == devt) { + found_by_devt = true; + break; + } + } + if (found_by_devt) + break; + } + + if (found_by_devt) { + /* Existing device. */ + if (fsid_fs_devices == NULL) { + if (devt_fs_devices->opened == 0) { + /* Stale device. */ + return NULL; + } else { + /* temp_fsid is mounting a subvol. */ + return devt_fs_devices; + } + } else { + /* Regular or temp_fsid device mounting a subvol. */ + return devt_fs_devices; + } + } else { + /* New device. */ + if (fsid_fs_devices == NULL) { + return NULL; + } else { + /* sb::fsid is already used, create a new temp_fsid. */ + *same_fsid_diff_dev = true; + return NULL; + } + } + + /* Not reached. */ +} + /* * This is only used on mount, and we are protected from competing things * messing with our fs_devices by the uuid_mutex, thus we do not need the @@ -613,7 +646,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct block_device *bdev; + struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -624,7 +657,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &disk_super); + &bdev_file, &disk_super); if (ret) return ret; @@ -648,21 +681,31 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev)) + if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev)) + if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev)) + if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; - device->bdev = bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - device->holder = holder; + + if (device->devt != device->bdev->bd_dev) { + btrfs_warn(NULL, + "device %s maj:min changed from %d:%d to %d:%d", + device->name->str, MAJOR(device->devt), + MINOR(device->devt), MAJOR(device->bdev->bd_dev), + MINOR(device->bdev->bd_dev)); + + device->devt = device->bdev->bd_dev; + } fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && @@ -676,12 +719,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - blkdev_put(bdev, holder); + fput(bdev_file); return -EINVAL; } -u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) +const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) { bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); @@ -726,84 +769,6 @@ out: } /* - * Handle scanned device having its
CHANGING_FSID_V2 flag set and the fs_devices - * being created with a disk that has already completed its fsid change. Such - * disk can belong to an fs which has its FSID changed or to one which doesn't. - * Handle both cases here. - */ -static struct btrfs_fs_devices *find_fsid_inprogress( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->fsid)) - return fs_devices; - } - - return find_fsid(disk_super->fsid, NULL); -} - -static struct btrfs_fs_devices *find_fsid_changed( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - /* - * Handles the case where scanned device is part of an fs that had - * multiple successful changes of FSID but currently device didn't - * observe it. Meaning our fsid will be different than theirs. We need - * to handle two subcases : - * 1 - The fs still continues to have different METADATA/FSID uuids. - * 2 - The fs is switched back to its original FSID (METADATA/FSID - * are equal). - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - /* Changed UUIDs */ - if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && - memcmp(fs_devices->fsid, disk_super->fsid, - BTRFS_FSID_SIZE) != 0) - return fs_devices; - - /* Unchanged UUIDs */ - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, disk_super->metadata_uuid, - BTRFS_FSID_SIZE) == 0) - return fs_devices; - } - - return NULL; -} - -static struct btrfs_fs_devices *find_fsid_reverted_metadata( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - /* - * Handle the case where the scanned device is part of an fs whose last - * metadata UUID change reverted it to the original FSID. At the same - * time fs_devices was first created by another constituent device - * which didn't fully observe the operation. This results in an - * btrfs_fs_devices created with metadata/fsid different AND - * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the - * fs_devices equal to the FSID of the disk. 
- */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->fsid)) - return fs_devices; - } - - return NULL; -} -/* * Add new device to list of registered devices * * Returns: @@ -821,10 +786,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; int error; + bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); - bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & - BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + btrfs_err(NULL, +"device %s has incomplete metadata_uuid change, please use btrfstune to complete", + path); + return ERR_PTR(-EAGAIN); + } error = lookup_bdev(path, &path_devt); if (error) { @@ -833,27 +804,24 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(error); } - if (fsid_change_in_progress) { - if (!has_metadata_uuid) - fs_devices = find_fsid_inprogress(disk_super); - else - fs_devices = find_fsid_changed(disk_super); - } else if (has_metadata_uuid) { - fs_devices = find_fsid_with_metadata_uuid(disk_super); - } else { - fs_devices = find_fsid_reverted_metadata(disk_super); - if (!fs_devices) - fs_devices = find_fsid(disk_super->fsid, NULL); - } - + fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid, - has_metadata_uuid ? disk_super->metadata_uuid : NULL); + fs_devices = alloc_fs_devices(disk_super->fsid); if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); - fs_devices->fsid_change = fsid_change_in_progress; + if (has_metadata_uuid) + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + if (same_fsid_diff_dev) { + generate_random_uuid(fs_devices->fsid); + fs_devices->temp_fsid = true; + pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid); + } mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); @@ -868,18 +836,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, &args); - /* - * If this disk has been pulled into an fs devices created by - * a device which had the CHANGING_FSID_V2 flag then replace the - * metadata_uuid/fsid values of the fs_devices. 
- */ - if (fs_devices->fsid_change && - found_transid > fs_devices->latest_generation) { + if (found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); memcpy(fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); - fs_devices->fsid_change = false; } } @@ -888,8 +849,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, -"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", - path, fs_devices->fsid, current->comm, +"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid, current->comm, task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); @@ -915,13 +877,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (disk_super->label[0]) pr_info( - "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->label, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); else pr_info( - "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->fsid, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); } else if (!device->name || !is_same_device(device, path)) { @@ -1033,7 +997,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) lockdep_assert_held(&uuid_mutex); - fs_devices = alloc_fs_devices(orig->fsid, NULL); + fs_devices = alloc_fs_devices(orig->fsid); if (IS_ERR(fs_devices)) return fs_devices; @@ -1104,9 +1068,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev) { - blkdev_put(device->bdev, device->holder); + if (device->bdev_file) { + fput(device->bdev_file); device->bdev = NULL; + device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1151,7 +1116,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - blkdev_put(device->bdev, device->holder); + fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1176,6 +1141,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; + device->bdev_file = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); btrfs_destroy_dev_zone_info(device); @@ -1363,7 +1329,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev return ERR_PTR(-EINVAL); /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); + page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); if (IS_ERR(page)) return ERR_CAST(page); @@ -1396,30 +1362,70 @@ int btrfs_forget_devices(dev_t devt) return ret; } +static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, + const char *path, dev_t devt, + bool mount_arg_dev) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Do not skip device registration for mounted devices with matching + * maj:min but different paths. 
Booting without initrd relies on + * /dev/root initially, later replaced with the actual root device. + * A successful scan ensures grub2-probe selects the correct device. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + struct btrfs_device *device; + + mutex_lock(&fs_devices->device_list_mutex); + + if (!fs_devices->opened) { + mutex_unlock(&fs_devices->device_list_mutex); + continue; + } + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->bdev && (device->bdev->bd_dev == devt) && + strcmp(device->name->str, path) != 0) { + mutex_unlock(&fs_devices->device_list_mutex); + + /* Do not skip registration. */ + return false; + } + } + mutex_unlock(&fs_devices->device_list_mutex); + } + + if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && + !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) + return true; + + return false; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock - * is read via pagecache + * is read via pagecache. + * + * With @mount_arg_dev it's a scan during mount time that will always register + * the device or return an error. Multi-device and seeding devices are registered + * in both cases. */ -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, + bool mount_arg_dev) { struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct block_device *bdev; - u64 bytenr, bytenr_orig; + struct file *bdev_file; + u64 bytenr; + dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); /* - * we would like to check all the supers, but that would make - * a btrfs mount succeed after a mkfs from a different FS. - * So, we need to add a special mount option to scan for - * later supers, using BTRFS_SUPER_MIRROR_MAX instead - */ - - /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, * resulting in failure. @@ -1429,31 +1435,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev = blkdev_get_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); - bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); + /* + * We would like to check all the super blocks, but doing so would + * allow a mount to succeed after a mkfs from a different filesystem. + * Currently, recovery from a bad primary btrfs superblock is done + * using the userspace command 'btrfs check --super'. 
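The tail condition of btrfs_skip_registration() above can be read as: a scan that did not come from the mount path may skip registering a plain single-device, non-seeding filesystem, because no other device will ever need it for assembly. The sketch below models only that condition and leaves out the path-mismatch exception handled earlier in the function:

#include <stdbool.h>
#include <stdio.h>

struct scan_model {
    unsigned int num_devices;   /* from the on-disk super block */
    bool seeding;               /* seeding flag set in the super block */
    bool mount_arg_dev;         /* scan came from the mount path */
};

static bool skip_registration_model(const struct scan_model *s)
{
    return !s->mount_arg_dev && s->num_devices == 1 && !s->seeding;
}

int main(void)
{
    struct scan_model udev_scan  = { 1, false, false };
    struct scan_model mount_scan = { 1, false, true };

    printf("udev scan skips: %d\n", skip_registration_model(&udev_scan));   /* 1 */
    printf("mount scan skips: %d\n", skip_registration_model(&mount_scan)); /* 0 */
    return 0;
}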
+ */ + ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, + btrfs_sb_offset(0)); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; } + devt = file_bdev(bdev_file)->bd_dev; + if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { + pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", + path, MAJOR(devt), MINOR(devt)); + + btrfs_free_stale_devices(devt, NULL); + + device = NULL; + goto free_disk_super; + } + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); +free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - blkdev_put(bdev, NULL); + fput(bdev_file); return device; } @@ -1825,19 +1849,18 @@ out: static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree; - struct extent_map *em; struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - n = rb_last(&em_tree->map.rb_root); + read_lock(&fs_info->mapping_tree_lock); + n = rb_last(&fs_info->mapping_tree.rb_root); if (n) { - em = rb_entry(n, struct extent_map, rb_node); - ret = em->start + em->len; + struct btrfs_chunk_map *map; + + map = rb_entry(n, struct btrfs_chunk_map, rb_node); + ret = map->start + map->chunk_len; } - read_unlock(&em_tree->lock); + read_unlock(&fs_info->mapping_tree_lock); return ret; } @@ -2105,11 +2128,10 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, copy_num, ret); } -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device) { int copy_num; + struct block_device *bdev = device->bdev; if (!bdev) return; @@ -2125,12 +2147,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + update_dev_time(device->name->str); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, void **holder) + struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2239,7 +2261,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev) { + if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2255,21 +2277,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and blkdev_put() will pull in the ->open_mutex on the - * block device and it's dependencies. Instead just flush the device - * and let the caller do the final blkdev_put. + * write lock, and fput() on the block device will pull in the + * ->open_mutex on the block device and its dependencies. Instead + * just flush the device and let the caller do the final bdev_release. 
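The new open/close discipline is easiest to see with btrfs_scan_one_device() reduced to its skeleton, condensed from the function above (kernel context assumed; the function name and the trimmed body are illustrative): the struct file returned by bdev_file_open_by_path() is the only handle that needs releasing, file_bdev() merely borrows the block_device from it, and a single fput() replaces the old blkdev_put(bdev, holder) pairing.

static struct btrfs_device *scan_one_device_skeleton(const char *path,
						     blk_mode_t flags)
{
	struct file *bdev_file;
	struct btrfs_device *device;

	/* No exclusive holder: scanning must not race with mount/mkfs. */
	bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
	if (IS_ERR(bdev_file))
		return ERR_CAST(bdev_file);

	/* ... read the super block via file_bdev(bdev_file) ... */
	device = NULL;		/* stands in for device_list_add() */

	fput(bdev_file);	/* single release, no holder argument */
	return device;
}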
*/ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { - btrfs_scratch_superblocks(fs_info, device->bdev, - device->name->str); + btrfs_scratch_superblocks(fs_info, device); if (device->bdev) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } } - *bdev = device->bdev; - *holder = device->holder; + *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device); @@ -2375,8 +2395,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_unlock(&fs_devices->device_list_mutex); - btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, - tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); btrfs_close_bdev(tgtdev); synchronize_rcu(); @@ -2406,7 +2425,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct block_device *bdev; + struct file *bdev_file; int ret; if (!path || !path[0]) @@ -2424,7 +2443,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev, &disk_super); + &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2437,7 +2456,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - blkdev_put(bdev, NULL); + fput(bdev_file); return 0; } @@ -2494,7 +2513,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) * Private copy of the seed devices, anchored at * fs_info->fs_devices->seed_list */ - seed_devices = alloc_fs_devices(NULL, NULL); + seed_devices = alloc_fs_devices(NULL); if (IS_ERR(seed_devices)) return seed_devices; @@ -2657,7 +2676,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct block_device *bdev; + struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2670,12 +2689,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); - if (!btrfs_check_device_zone_type(fs_info, bdev)) { + if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } @@ -2687,11 +2706,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev); + sync_blockdev(file_bdev(bdev_file)); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev) { + if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2707,7 +2726,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev = bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2728,14 +2748,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, 
const char *device_path device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; device->total_bytes = - round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); + round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { /* GFP_KERNEL allocation must not be under device_list_mutex */ @@ -2767,7 +2786,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!bdev_nonrot(bdev)) + if (!bdev_nonrot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -2887,7 +2906,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - blkdev_put(bdev, fs_info->bdev_holder); + fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -2968,6 +2987,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; + atomic64_add(diff, &fs_info->free_chunk_space); btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); @@ -3004,16 +3024,19 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_handle_fs_error(fs_info, -ENOENT, - "Failed lookup while freeing chunk."); - ret = -ENOENT; + btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", + chunk_offset); + btrfs_abort_transaction(trans, -ENOENT); + ret = -EUCLEAN; goto out; } ret = btrfs_del_item(trans, root, path); - if (ret < 0) - btrfs_handle_fs_error(fs_info, ret, - "Failed to delete chunk item."); + if (ret < 0) { + btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); + btrfs_abort_transaction(trans, ret); + goto out; + } out: btrfs_free_path(path); return ret; @@ -3065,45 +3088,118 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) return ret; } +struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev; + struct btrfs_chunk_map *map; + struct btrfs_chunk_map *prev_map = NULL; + + while (node) { + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + prev = node; + prev_map = map; + + if (logical < map->start) { + node = node->rb_left; + } else if (logical >= map->start + map->chunk_len) { + node = node->rb_right; + } else { + refcount_inc(&map->refs); + return map; + } + } + + if (!prev) + return NULL; + + orig_prev = prev; + while (prev && logical >= prev_map->start + prev_map->chunk_len) { + prev = rb_next(prev); + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + } + + if (!prev) { + prev = orig_prev; + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + while (prev && logical < prev_map->start) { + prev = rb_prev(prev); + prev_map = rb_entry(prev, struct 
btrfs_chunk_map, rb_node); + } + } + + if (prev) { + u64 end = logical + length; + + /* + * Caller can pass a U64_MAX length when it wants to get any + * chunk starting at an offset of 'logical' or higher, so deal + * with underflow by resetting the end offset to U64_MAX. + */ + if (end < logical) + end = U64_MAX; + + if (end > prev_map->start && + logical < prev_map->start + prev_map->chunk_len) { + refcount_inc(&prev_map->refs); + return prev_map; + } + } + + return NULL; +} + +struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct btrfs_chunk_map *map; + + read_lock(&fs_info->mapping_tree_lock); + map = btrfs_find_chunk_map_nolock(fs_info, logical, length); + read_unlock(&fs_info->mapping_tree_lock); + + return map; +} + /* - * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * Find the mapping containing the given logical extent. + * * @logical: Logical block offset in bytes. * @length: Length of extent in bytes. * * Return: Chunk mapping or ERR_PTR. */ -struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { - struct extent_map_tree *em_tree; - struct extent_map *em; + struct btrfs_chunk_map *map; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, length); - read_unlock(&em_tree->lock); + map = btrfs_find_chunk_map(fs_info, logical, length); - if (!em) { + if (unlikely(!map)) { btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } - if (em->start > logical || em->start + em->len <= logical) { + if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", - logical, logical + length, em->start, em->start + em->len); - free_extent_map(em); + logical, logical + length, map->start, + map->start + map->chunk_len); + btrfs_free_chunk_map(map); return ERR_PTR(-EINVAL); } - /* callers are responsible for dropping em's ref. */ - return em; + /* Callers are responsible for dropping the reference. */ + return map; } static int remove_chunk_item(struct btrfs_trans_handle *trans, - struct map_lookup *map, u64 chunk_offset) + struct btrfs_chunk_map *map, u64 chunk_offset) { int i; @@ -3128,23 +3224,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 dev_extent_len = 0; int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(em)) { + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - return PTR_ERR(em); + return PTR_ERR(map); } - map = em->map_lookup; /* * First delete the device extent items from the devices btree. 
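The tail of btrfs_find_chunk_map_nolock() above reduces to a half-open interval intersection with one wrinkle: a caller may pass a U64_MAX length to mean "any chunk at or after logical", which makes logical + length wrap past zero, so the end offset has to be clamped back to U64_MAX. A self-contained model of just that predicate (the struct and function names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>

struct chunk_range {
	uint64_t start;
	uint64_t chunk_len;
};

/* Does [logical, logical + length) intersect [start, start + chunk_len)?
 * A U64_MAX length wraps the end offset; reset it to U64_MAX so the
 * query still means "any chunk at or after logical". */
static bool chunk_intersects(const struct chunk_range *map,
			     uint64_t logical, uint64_t length)
{
	uint64_t end = logical + length;

	if (end < logical)
		end = UINT64_MAX;

	return end > map->start && logical < map->start + map->chunk_len;
}

int main(void)
{
	const struct chunk_range c = { .start = 1ULL << 20, .chunk_len = 1ULL << 20 };

	/* "Any chunk at or after offset 0" must match despite the wrap. */
	return chunk_intersects(&c, 0, UINT64_MAX) ? 0 : 1;
}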
@@ -3174,6 +3268,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); + + if (list_empty(&device->post_commit_list)) { + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + } + mutex_unlock(&fs_info->chunk_mutex); } } @@ -3247,7 +3347,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) goto out; } - trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); + trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); @@ -3266,7 +3366,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) */ btrfs_trans_release_chunk_metadata(trans); - ret = btrfs_remove_block_group(trans, chunk_offset, em); + ret = btrfs_remove_block_group(trans, map); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3278,7 +3378,7 @@ out: trans->removing_chunk = false; } /* once for us */ - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -3489,6 +3589,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { @@ -3633,7 +3771,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) struct btrfs_balance_control *bctl = fs_info->balance_ctl; int ret; - BUG_ON(!fs_info->balance_ctl); + ASSERT(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = NULL; @@ -4687,183 +4825,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_uuid_scan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = data; - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_key key; - struct btrfs_path *path = NULL; - int ret = 0; - struct extent_buffer *eb; - int slot; - struct btrfs_root_item root_item; - u32 item_size; - struct btrfs_trans_handle *trans = NULL; - bool closing = false; - - path = btrfs_alloc_path(); - if 
(!path) { - ret = -ENOMEM; - goto out; - } - - key.objectid = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = 0; - - while (1) { - if (btrfs_fs_closing(fs_info)) { - closing = true; - break; - } - ret = btrfs_search_forward(root, &key, path, - BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - break; - } - - if (key.type != BTRFS_ROOT_ITEM_KEY || - (key.objectid < BTRFS_FIRST_FREE_OBJECTID && - key.objectid != BTRFS_FS_TREE_OBJECTID) || - key.objectid > BTRFS_LAST_FREE_OBJECTID) - goto skip; - - eb = path->nodes[0]; - slot = path->slots[0]; - item_size = btrfs_item_size(eb, slot); - if (item_size < sizeof(root_item)) - goto skip; - - read_extent_buffer(eb, &root_item, - btrfs_item_ptr_offset(eb, slot), - (int)sizeof(root_item)); - if (btrfs_root_refs(&root_item) == 0) - goto skip; - - if (!btrfs_is_empty_uuid(root_item.uuid) || - !btrfs_is_empty_uuid(root_item.received_uuid)) { - if (trans) - goto update_tree; - - btrfs_release_path(path); - /* - * 1 - subvol uuid item - * 1 - received_subvol uuid item - */ - trans = btrfs_start_transaction(fs_info->uuid_root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - continue; - } else { - goto skip; - } -update_tree: - btrfs_release_path(path); - if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - - if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, - root_item.received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - -skip: - btrfs_release_path(path); - if (trans) { - ret = btrfs_end_transaction(trans); - trans = NULL; - if (ret) - break; - } - - if (key.offset < (u64)-1) { - key.offset++; - } else if (key.type < BTRFS_ROOT_ITEM_KEY) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - } else if (key.objectid < (u64)-1) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.objectid++; - } else { - break; - } - cond_resched(); - } - -out: - btrfs_free_path(path); - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else if (!closing) - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); - up(&fs_info->uuid_tree_rescan_sem); - return 0; -} - -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *uuid_root; - struct task_struct *task; - int ret; - - /* - * 1 - root node - * 1 - root item - */ - trans = btrfs_start_transaction(tree_root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - fs_info->uuid_root = uuid_root; - - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_scan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device 
extents past * the new size, and then following the back refs to the chunks. @@ -4889,6 +4850,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; u64 start; + u64 free_diff = 0; new_size = round_down(new_size, fs_info->sectorsize); start = new_size; @@ -4914,7 +4876,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; - atomic64_sub(diff, &fs_info->free_chunk_space); + + /* + * The new free_chunk_space is new_size - used, so we have to + * subtract the delta of the old free_chunk_space which included + * old_size - used. If used > new_size then just subtract this + * entire device's free space. + */ + if (device->bytes_used < new_size) + free_diff = (old_size - device->bytes_used) - + (new_size - device->bytes_used); + else + free_diff = old_size - device->bytes_used; + atomic64_sub(free_diff, &fs_info->free_chunk_space); } /* @@ -5049,9 +5023,10 @@ done: if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes += diff; - atomic64_add(diff, &fs_info->free_chunk_space); + atomic64_add(free_diff, &fs_info->free_chunk_space); + } mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -5422,69 +5397,140 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, } } +static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits) +{ + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); + } +} + +static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) +{ + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + __clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, + NULL, NULL); + } +} + +void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) +{ + write_lock(&fs_info->mapping_tree_lock); + rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); + RB_CLEAR_NODE(&map->rb_node); + chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + write_unlock(&fs_info->mapping_tree_lock); + + /* Once for the tree reference. 
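The "once for the tree reference" comment above is the whole ownership story for the new chunk maps: btrfs_alloc_chunk_map() hands back a map with refs == 1, inserting it into the rb-tree transfers that reference to the tree, and every successful lookup takes one of its own. A minimal userspace model of that discipline, with a bare counter standing in for the kernel's refcount_t:

#include <stdlib.h>

struct chunk_map {
	unsigned int refs;
	/* stripe geometry lives here in the kernel */
};

static struct chunk_map *map_alloc(void)
{
	struct chunk_map *map = calloc(1, sizeof(*map));

	if (map)
		map->refs = 1;	/* becomes the tree's reference on insert */
	return map;
}

static void map_get(struct chunk_map *map)
{
	map->refs++;		/* e.g. a lookup returning the map */
}

static void map_put(struct chunk_map *map)
{
	if (--map->refs == 0)
		free(map);
}

int main(void)
{
	struct chunk_map *map = map_alloc();	/* refs == 1, held by the "tree" */

	map_get(map);	/* a lookup takes its own reference */
	map_put(map);	/* the lookup caller is done */
	map_put(map);	/* tree erase: once for the tree reference */
	return 0;
}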
*/ + btrfs_free_chunk_map(map); +} + +EXPORT_FOR_TESTS +int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + bool leftmost = true; + + write_lock(&fs_info->mapping_tree_lock); + p = &fs_info->mapping_tree.rb_root.rb_node; + while (*p) { + struct btrfs_chunk_map *entry; + + parent = *p; + entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); + + if (map->start < entry->start) { + p = &(*p)->rb_left; + } else if (map->start > entry->start) { + p = &(*p)->rb_right; + leftmost = false; + } else { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; + } + } + rb_link_node(&map->rb_node, parent, p); + rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); + chunk_map_device_set_bits(map, CHUNK_ALLOCATED); + chunk_map_device_clear_bits(map, CHUNK_TRIMMED); + write_unlock(&fs_info->mapping_tree_lock); + + return 0; +} + +EXPORT_FOR_TESTS +struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) +{ + struct btrfs_chunk_map *map; + + map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp); + if (!map) + return NULL; + + refcount_set(&map->refs, 1); + RB_CLEAR_NODE(&map->rb_node); + + return map; +} + static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = trans->fs_info; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; + struct btrfs_chunk_map *map; struct btrfs_block_group *block_group; - struct extent_map *em; u64 start = ctl->start; u64 type = ctl->type; int ret; - int i; - int j; - map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS); if (!map) return ERR_PTR(-ENOMEM); + + map->start = start; + map->chunk_len = ctl->chunk_size; + map->stripe_size = ctl->stripe_size; + map->type = type; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->sub_stripes = ctl->sub_stripes; map->num_stripes = ctl->num_stripes; - for (i = 0; i < ctl->ndevs; ++i) { - for (j = 0; j < ctl->dev_stripes; ++j) { + for (int i = 0; i < ctl->ndevs; i++) { + for (int j = 0; j < ctl->dev_stripes; j++) { int s = i * ctl->dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + j * ctl->stripe_size; } } - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; - map->type = type; - map->sub_stripes = ctl->sub_stripes; trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); - em = alloc_extent_map(); - if (!em) { - kfree(map); - return ERR_PTR(-ENOMEM); - } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = start; - em->len = ctl->chunk_size; - em->block_start = 0; - em->block_len = em->len; - em->orig_block_len = ctl->stripe_size; - - em_tree = &info->mapping_tree; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_chunk_map(info, map); if (ret) { - write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_chunk_map(map); return ERR_PTR(ret); } - write_unlock(&em_tree->lock); block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); - if (IS_ERR(block_group)) - goto error_del_extent; + if (IS_ERR(block_group)) { + btrfs_remove_chunk_map(info, map); + return block_group; + } - for (i = 0; i < map->num_stripes; i++) { + for (int i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = 
map->stripes[i].dev; btrfs_device_set_bytes_used(dev, @@ -5497,23 +5543,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, atomic64_sub(ctl->stripe_size * map->num_stripes, &info->free_chunk_space); - free_extent_map(em); check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); return block_group; - -error_del_extent: - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* One for our allocation */ - free_extent_map(em); - /* One for the tree reference */ - free_extent_map(em); - - return block_group; } struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, @@ -5589,8 +5622,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; size_t item_size; int i; int ret; @@ -5619,14 +5651,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, */ lockdep_assert_held(&fs_info->chunk_mutex); - em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, bg->start, bg->length); + if (IS_ERR(map)) { + ret = PTR_ERR(map); btrfs_abort_transaction(trans, ret); return ret; } - map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); @@ -5683,7 +5714,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, out: kfree(chunk); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -5728,7 +5759,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) return 0; } -static inline int btrfs_chunk_max_errors(struct map_lookup *map) +static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map) { const int index = btrfs_bg_flags_to_raid_index(map->type); @@ -5737,17 +5768,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; int miss_ndevs = 0; int i; bool ret = true; - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) return false; - map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (test_bit(BTRFS_DEV_STATE_MISSING, &map->stripes[i].dev->dev_state)) { @@ -5768,38 +5797,57 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (miss_ndevs > btrfs_chunk_max_errors(map)) ret = false; end: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } -void btrfs_mapping_tree_free(struct extent_map_tree *tree) +void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) { - struct extent_map *em; + write_lock(&fs_info->mapping_tree_lock); + while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) { + struct btrfs_chunk_map *map; + struct rb_node *node; - while (1) { - write_lock(&tree->lock); - em = lookup_extent_mapping(tree, 0, (u64)-1); - if (em) - remove_extent_mapping(tree, em); - write_unlock(&tree->lock); - if (!em) - break; - /* once for us */ - free_extent_map(em); - /* once for the tree */ - free_extent_map(em); + node = rb_first_cached(&fs_info->mapping_tree); + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); + 
RB_CLEAR_NODE(&map->rb_node); + chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + /* Once for the tree ref. */ + btrfs_free_chunk_map(map); + cond_resched_rwlock_write(&fs_info->mapping_tree_lock); } + write_unlock(&fs_info->mapping_tree_lock); +} + +static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type); + + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + + /* + * There could be two corrupted data stripes, we need to loop retry in + * order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the stripe under + * reconstruction. + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return map->num_stripes; + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + return btrfs_raid_array[index].ncopies; } int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct extent_map *em; - struct map_lookup *map; - enum btrfs_raid_types index; - int ret = 1; + struct btrfs_chunk_map *map; + int ret; - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(map)) /* * We could return errors for these cases, but that could get * ugly and we'd probably do the same thing which is just not do @@ -5808,72 +5856,53 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - map = em->map_lookup; - index = btrfs_bg_flags_to_raid_index(map->type); - - /* Non-RAID56, use their ncopies from btrfs_raid_array. */ - if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) - ret = btrfs_raid_array[index].ncopies; - else if (map->type & BTRFS_BLOCK_GROUP_RAID5) - ret = 2; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - /* - * There could be two corrupted data stripes, we need - * to loop retry in order to rebuild the correct data. - * - * Fail a stripe at a time on every retry except the - * stripe under reconstruction. 
- */ - ret = map->num_stripes; - free_extent_map(em); + ret = btrfs_chunk_map_num_copies(map); + btrfs_free_chunk_map(map); return ret; } unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; unsigned long len = fs_info->sectorsize; if (!btrfs_fs_incompat(fs_info, RAID56)) return len; - em = btrfs_get_chunk_map(fs_info, logical, len); + map = btrfs_get_chunk_map(fs_info, logical, len); - if (!WARN_ON(IS_ERR(em))) { - map = em->map_lookup; + if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); - free_extent_map(em); + btrfs_free_chunk_map(map); } return len; } int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; int ret = 0; if (!btrfs_fs_incompat(fs_info, RAID56)) return 0; - em = btrfs_get_chunk_map(fs_info, logical, len); + map = btrfs_get_chunk_map(fs_info, logical, len); - if(!WARN_ON(IS_ERR(em))) { - map = em->map_lookup; + if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; - free_extent_map(em); + btrfs_free_chunk_map(map); } return ret; } static int find_live_mirror(struct btrfs_fs_info *fs_info, - struct map_lookup *map, int first, + struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { + const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; int num_stripes; int preferred_mirror; @@ -5888,13 +5917,12 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - switch (fs_info->fs_devices->read_policy) { + switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ - btrfs_warn_rl(fs_info, - "unknown read_policy type %u, reset to pid", - fs_info->fs_devices->read_policy); - fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", + policy); + WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); fallthrough; case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); @@ -5931,6 +5959,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, } static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -5950,6 +5979,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ bioc->fs_info = fs_info; bioc->replace_stripe_src = -1; bioc->full_stripe_logical = (u64)-1; + bioc->logical = logical; return bioc; } @@ -5976,8 +6006,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; @@ -5995,11 +6024,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, int ret; int i; - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - map = em->map_lookup; + map = btrfs_get_chunk_map(fs_info, logical, length); + if (IS_ERR(map)) + return ERR_CAST(map); /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -6007,8 +6034,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct 
btrfs_fs_info *fs_info, goto out_free_map; } - offset = logical - em->start; - length = min_t(u64, em->start + em->len - logical, length); + offset = logical - map->start; + length = min_t(u64, map->start + map->chunk_len - logical, length); *length_ret = length; /* @@ -6105,10 +6132,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, } } - free_extent_map(em); + btrfs_free_chunk_map(map); return stripes; out_free_map: - free_extent_map(em); + btrfs_free_chunk_map(map); return ERR_PTR(ret); } @@ -6129,20 +6156,19 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) return ret; } -static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_io_context *bioc, +static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, struct btrfs_dev_replace *dev_replace, u64 logical, - int *num_stripes_ret, int *max_errors_ret) + struct btrfs_io_geometry *io_geom) { u64 srcdev_devid = dev_replace->srcdev->devid; /* * At this stage, num_stripes is still the real number of stripes, * excluding the duplicated stripes. */ - int num_stripes = *num_stripes_ret; + int num_stripes = io_geom->num_stripes; + int max_errors = io_geom->max_errors; int nr_extra_stripes = 0; - int max_errors = *max_errors_ret; int i; /* @@ -6183,7 +6209,7 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, * replace. * If we have 2 extra stripes, only choose the one with smaller physical. */ - if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { + if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; @@ -6201,22 +6227,21 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, } } - *num_stripes_ret = num_stripes + nr_extra_stripes; - *max_errors_ret = max_errors + nr_extra_stripes; + io_geom->num_stripes = num_stripes + nr_extra_stripes; + io_geom->max_errors = max_errors + nr_extra_stripes; bioc->replace_nr_stripes = nr_extra_stripes; } -static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, - u64 offset, u32 *stripe_nr, u64 *stripe_offset, - u64 *full_stripe_start) +static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, + struct btrfs_io_geometry *io_geom) { /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ - *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; - *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; - ASSERT(*stripe_offset < U32_MAX); + io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; + io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + ASSERT(io_geom->stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = @@ -6231,18 +6256,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * to go rounddown(), not round_down(), as nr_data_stripes is * not ensured to be power of 2. 
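Concretely, the full-stripe start computed above is the stripe number rounded down to a multiple of the data-stripe count and scaled back to bytes; the plain-modulo rounddown() matters because nr_data_stripes (say, 3 on a four-device RAID5) need not be a power of two. A standalone sketch of the arithmetic, assuming the usual 64KiB BTRFS_STRIPE_LEN (the macro here is a local stand-in):

#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN	(64 * 1024ULL)	/* BTRFS_STRIPE_LEN */

/* Plain-modulo round down, correct for any divisor. */
static uint64_t rounddown_u64(uint64_t x, uint64_t y)
{
	return x - (x % y);
}

int main(void)
{
	const int nr_data_stripes = 3;	/* e.g. RAID5 across four devices */
	uint64_t offset = 5 * STRIPE_LEN + 4096;	/* offset within the chunk */
	uint64_t stripe_nr = offset / STRIPE_LEN;	/* stripe 5 */
	uint64_t full_stripe_start =
		rounddown_u64(stripe_nr, nr_data_stripes) * STRIPE_LEN;

	/* stripe 5 rounds down to stripe 3: the full stripe starts at
	 * 3 * 64KiB = 192KiB into the chunk. */
	printf("full stripe start: %llu\n",
	       (unsigned long long)full_stripe_start);
	return 0;
}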
*/ - *full_stripe_start = - btrfs_stripe_nr_to_offset( - rounddown(*stripe_nr, nr_data_stripes(map))); + io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( + rounddown(io_geom->stripe_nr, nr_data_stripes(map))); - ASSERT(*full_stripe_start + full_stripe_len > offset); - ASSERT(*full_stripe_start <= offset); + ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); + ASSERT(io_geom->raid56_full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. */ - if (op == BTRFS_MAP_WRITE) - return full_stripe_len - (offset - *full_stripe_start); + if (io_geom->op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - io_geom->raid56_full_stripe_start); } /* @@ -6250,16 +6274,178 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * a single disk). */ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) - return BTRFS_STRIPE_LEN - *stripe_offset; + return BTRFS_STRIPE_LEN - io_geom->stripe_offset; return U64_MAX; } -static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, - u32 stripe_index, u64 stripe_offset, u32 stripe_nr) +static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, + u64 *length, struct btrfs_io_stripe *dst, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) { - dst->dev = map->stripes[stripe_index].dev; - dst->physical = map->stripes[stripe_index].physical + - stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); + dst->dev = map->stripes[io_geom->stripe_index].dev; + + if (io_geom->op == BTRFS_MAP_READ && + btrfs_need_stripe_tree_update(fs_info, map->type)) + return btrfs_get_raid_extent_offset(fs_info, logical, length, + map->type, + io_geom->stripe_index, dst); + + dst->physical = map->stripes[io_geom->stripe_index].physical + + io_geom->stripe_offset + + btrfs_stripe_nr_to_offset(io_geom->stripe_nr); + return 0; +} + +static bool is_single_device_io(struct btrfs_fs_info *fs_info, + const struct btrfs_io_stripe *smap, + const struct btrfs_chunk_map *map, + int num_alloc_stripes, + enum btrfs_map_op op, int mirror_num) +{ + if (!smap) + return false; + + if (num_alloc_stripes != 1) + return false; + + if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + return false; + + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + return false; + + return true; +} + +static void map_blocks_raid0(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; + io_geom->stripe_nr /= map->num_stripes; + if (io_geom->op == BTRFS_MAP_READ) + io_geom->mirror_num = 1; +} + +static void map_blocks_raid1(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + bool dev_replace_is_ongoing) +{ + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->num_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index = io_geom->mirror_num - 1; + return; + } + + io_geom->stripe_index = find_live_mirror(fs_info, map, 0, + dev_replace_is_ongoing); + io_geom->mirror_num = io_geom->stripe_index + 1; +} + +static void map_blocks_dup(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->num_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index = io_geom->mirror_num - 1; + return; + } + + io_geom->mirror_num = 1; +} + +static void 
map_blocks_raid10(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + bool dev_replace_is_ongoing) +{ + u32 factor = map->num_stripes / map->sub_stripes; + int old_stripe_index; + + io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes; + io_geom->stripe_nr /= factor; + + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->sub_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index += io_geom->mirror_num - 1; + return; + } + + old_stripe_index = io_geom->stripe_index; + io_geom->stripe_index = find_live_mirror(fs_info, map, + io_geom->stripe_index, + dev_replace_is_ongoing); + io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1; +} + +static void map_blocks_raid56_write(struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + u64 logical, u64 *length) +{ + int data_stripes = nr_data_stripes(map); + + /* + * Needs full stripe mapping. + * + * Push stripe_nr back to the start of the full stripe For those cases + * needing a full stripe, @stripe_nr is the full stripe number. + * + * Originally we go raid56_full_stripe_start / full_stripe_len, but + * that can be expensive. Here we just divide @stripe_nr with + * @data_stripes. + */ + io_geom->stripe_nr /= data_stripes; + + /* RAID[56] write or recovery. Return all stripes */ + io_geom->num_stripes = map->num_stripes; + io_geom->max_errors = btrfs_chunk_max_errors(map); + + /* Return the length to the full stripe end. */ + *length = min(logical + *length, + io_geom->raid56_full_stripe_start + map->start + + btrfs_stripe_nr_to_offset(data_stripes)) - + logical; + io_geom->stripe_index = 0; + io_geom->stripe_offset = 0; +} + +static void map_blocks_raid56_read(struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + int data_stripes = nr_data_stripes(map); + + ASSERT(io_geom->mirror_num <= 1); + /* Just grab the data stripe directly. */ + io_geom->stripe_index = io_geom->stripe_nr % data_stripes; + io_geom->stripe_nr /= data_stripes; + + /* We distribute the parity blocks across stripes. */ + io_geom->stripe_index = + (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes; + + if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1) + io_geom->mirror_num = 1; +} + +static void map_blocks_single(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; + io_geom->stripe_nr /= map->num_stripes; + io_geom->mirror_num = io_geom->stripe_index + 1; } /* @@ -6296,54 +6482,42 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * * For RAID6 profile, mirror > 2 means mark another * data/P stripe error and rebuild from the remaining * stripes.. - * - * @need_raid_map: (Used only for integrity checker) whether the map wants - * a full stripe map (including all data and P/Q stripes) - * for RAID56. Should always be 1 except integrity checker. */ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map) + struct btrfs_io_stripe *smap, int *mirror_num_ret) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; + struct btrfs_io_geometry io_geom = { 0 }; u64 map_offset; - u64 stripe_offset; - u32 stripe_nr; - u32 stripe_index; - int data_stripes; - int i; int ret = 0; - int mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); - int num_stripes; int num_copies; - int max_errors = 0; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; u16 num_alloc_stripes; - u64 raid56_full_stripe_start = (u64)-1; u64 max_len; ASSERT(bioc_ret); - num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (mirror_num > num_copies) - return -EINVAL; + io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); + io_geom.num_stripes = 1; + io_geom.stripe_index = 0; + io_geom.op = op; - em = btrfs_get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, logical, *length); + if (IS_ERR(map)) + return PTR_ERR(map); - map = em->map_lookup; - data_stripes = nr_data_stripes(map); + num_copies = btrfs_chunk_map_num_copies(map); + if (io_geom.mirror_num > num_copies) + return -EINVAL; - map_offset = logical - em->start; - max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, - &stripe_offset, &raid56_full_stripe_start); - *length = min_t(u64, em->len - map_offset, max_len); + map_offset = logical - map->start; + io_geom.raid56_full_stripe_start = (u64)-1; + max_len = btrfs_max_io_len(map, map_offset, &io_geom); + *length = min_t(u64, map->chunk_len - map_offset, max_len); if (dev_replace->replace_task != current) down_read(&dev_replace->rwsem); @@ -6356,110 +6530,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); - num_stripes = 1; - stripe_index = 0; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_index = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; - if (op == BTRFS_MAP_READ) - mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - if (op != BTRFS_MAP_READ) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; - } else { - stripe_index = find_live_mirror(fs_info, map, 0, - dev_replace_is_ongoing); - mirror_num = stripe_index + 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op != BTRFS_MAP_READ) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; - } else { - mirror_num = 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u32 factor = map->num_stripes / map->sub_stripes; - - stripe_index = (stripe_nr % factor) * map->sub_stripes; - stripe_nr /= factor; - - if (op != BTRFS_MAP_READ) - num_stripes = map->sub_stripes; - else if (mirror_num) - stripe_index += mirror_num - 1; - else { - int old_stripe_index = stripe_index; - stripe_index = find_live_mirror(fs_info, map, - stripe_index, - dev_replace_is_ongoing); - mirror_num = stripe_index - old_stripe_index + 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) { - /* - * Push stripe_nr back to the start of the full stripe - * For those cases needing a full stripe, @stripe_nr - * is the full stripe number. - * - * Originally we go raid56_full_stripe_start / full_stripe_len, - * but that can be expensive. Here we just divide - * @stripe_nr with @data_stripes. - */ - stripe_nr /= data_stripes; - - /* RAID[56] write or recovery. 
Return all stripes */ - num_stripes = map->num_stripes; - max_errors = btrfs_chunk_max_errors(map); - - /* Return the length to the full stripe end */ - *length = min(logical + *length, - raid56_full_stripe_start + em->start + - btrfs_stripe_nr_to_offset(data_stripes)) - - logical; - stripe_index = 0; - stripe_offset = 0; - } else { - /* - * Mirror #0 or #1 means the original data block. - * Mirror #2 is RAID5 parity block. - * Mirror #3 is RAID6 Q block. - */ - stripe_index = stripe_nr % data_stripes; - stripe_nr /= data_stripes; - if (mirror_num > 1) - stripe_index = data_stripes + mirror_num - 2; - - /* We distribute the parity blocks across stripes */ - stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (op == BTRFS_MAP_READ && mirror_num <= 1) - mirror_num = 1; - } - } else { + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case BTRFS_BLOCK_GROUP_RAID0: + map_blocks_raid0(map, &io_geom); + break; + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); + break; + case BTRFS_BLOCK_GROUP_DUP: + map_blocks_dup(map, &io_geom); + break; + case BTRFS_BLOCK_GROUP_RAID10: + map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); + break; + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) + map_blocks_raid56_write(map, &io_geom, logical, length); + else + map_blocks_raid56_read(map, &io_geom); + break; + default: /* * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - stripe_index = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; - mirror_num = stripe_index + 1; + map_blocks_single(map, &io_geom); + break; } - if (stripe_index >= map->num_stripes) { + if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", - stripe_index, map->num_stripes); + io_geom.stripe_index, map->num_stripes); ret = -EINVAL; goto out; } - num_alloc_stripes = num_stripes; + num_alloc_stripes = io_geom.num_stripes; if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) /* @@ -6476,17 +6586,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (smap && num_alloc_stripes == 1 && - !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { - set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, + io_geom.mirror_num)) { + ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) - *mirror_num_ret = mirror_num; + *mirror_num_ret = io_geom.mirror_num; *bioc_ret = NULL; - ret = 0; goto out; } - bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); + bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; @@ -6500,8 +6609,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * * It's still mostly the same as other profiles, just with extra rotation. 
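The "extra rotation" is nothing more than indexing the device stripes with (i + stripe_nr) modulo num_stripes, so the device serving each slot (including the parity slot) shifts by one with every full stripe. A standalone illustration for a three-device RAID5, where the last slot of each row is the parity stripe (illustrative layout, not the kernel code):

#include <stdio.h>

int main(void)
{
	const int num_stripes = 3;	/* 2 data + 1 parity (RAID5) */

	/* stripe_nr counts full stripes before this one and doubles as
	 * the rotation value, exactly as in the loop above. */
	for (int stripe_nr = 0; stripe_nr < 3; stripe_nr++) {
		printf("full stripe %d:", stripe_nr);
		for (int i = 0; i < num_stripes; i++)
			printf(" dev%d", (i + stripe_nr) % num_stripes);
		printf("\n");
	}
	return 0;
}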
*/ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && - (op != BTRFS_MAP_READ || mirror_num > 1)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && + (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo @@ -6510,37 +6619,52 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * In this case, we just add @stripe_nr with @i, then do the * modulo, to reduce one modulo call. */ - bioc->full_stripe_logical = em->start + - btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); - for (i = 0; i < num_stripes; i++) - set_io_stripe(&bioc->stripes[i], map, - (i + stripe_nr) % num_stripes, - stripe_offset, stripe_nr); + bioc->full_stripe_logical = map->start + + btrfs_stripe_nr_to_offset(io_geom.stripe_nr * + nr_data_stripes(map)); + for (int i = 0; i < io_geom.num_stripes; i++) { + struct btrfs_io_stripe *dst = &bioc->stripes[i]; + u32 stripe_index; + + stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes; + dst->dev = map->stripes[stripe_index].dev; + dst->physical = + map->stripes[stripe_index].physical + + io_geom.stripe_offset + + btrfs_stripe_nr_to_offset(io_geom.stripe_nr); + } } else { /* * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ - for (i = 0; i < num_stripes; i++) { - set_io_stripe(&bioc->stripes[i], map, stripe_index, - stripe_offset, stripe_nr); - stripe_index++; + for (int i = 0; i < io_geom.num_stripes; i++) { + ret = set_io_stripe(fs_info, logical, length, + &bioc->stripes[i], map, &io_geom); + if (ret < 0) + break; + io_geom.stripe_index++; } } + if (ret) { + *bioc_ret = NULL; + btrfs_put_bioc(bioc); + goto out; + } + if (op != BTRFS_MAP_READ) - max_errors = btrfs_chunk_max_errors(map); + io_geom.max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) { - handle_ops_on_dev_replace(op, bioc, dev_replace, logical, - &num_stripes, &max_errors); + handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom); } *bioc_ret = bioc; - bioc->num_stripes = num_stripes; - bioc->max_errors = max_errors; - bioc->mirror_num = mirror_num; + bioc->num_stripes = io_geom.num_stripes; + bioc->max_errors = io_geom.max_errors; + bioc->mirror_num = io_geom.mirror_num; out: if (dev_replace_is_ongoing && dev_replace->replace_task != current) { @@ -6548,7 +6672,7 @@ out: /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); } - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -6720,12 +6844,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -u64 btrfs_calc_stripe_length(const struct extent_map *em) +u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map) { - const struct map_lookup *map = em->map_lookup; const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(em->len, data_stripes); + return div_u64(map->chunk_len, data_stripes); } #if BITS_PER_LONG == 32 @@ -6794,9 +6917,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; + struct btrfs_chunk_map *map; u64 logical; u64 length; u64 devid; @@ -6830,35 +6951,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 
return ret; } - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, logical, 1); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? */ - if (em && em->start <= logical && em->start + em->len > logical) { - free_extent_map(em); + if (map && map->start <= logical && map->start + map->chunk_len > logical) { + btrfs_free_chunk_map(map); return 0; - } else if (em) { - free_extent_map(em); + } else if (map) { + btrfs_free_chunk_map(map); } - em = alloc_extent_map(); - if (!em) - return -ENOMEM; - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - free_extent_map(em); + map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS); + if (!map) return -ENOMEM; - } - - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = logical; - em->len = length; - em->orig_start = 0; - em->block_start = 0; - em->block_len = em->len; + map->start = logical; + map->chunk_len = length; map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); @@ -6873,7 +6981,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, */ map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; - em->orig_block_len = btrfs_calc_stripe_length(em); + map->stripe_size = btrfs_calc_stripe_length(map); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -6889,7 +6997,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { ret = PTR_ERR(map->stripes[i].dev); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } } @@ -6898,15 +7006,13 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, &(map->stripes[i].dev->dev_state)); } - write_lock(&map_tree->lock); - ret = add_extent_mapping(map_tree, em, 0); - write_unlock(&map_tree->lock); + ret = btrfs_add_chunk_map(fs_info, map); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", - em->start, em->len, ret); + map->start, map->chunk_len, ret); + btrfs_free_chunk_map(map); } - free_extent_map(em); return ret; } @@ -6954,7 +7060,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid, NULL); + fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) return fs_devices; @@ -7216,26 +7322,21 @@ out_short_read: bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - u64 next_start = 0; + struct btrfs_chunk_map *map; + u64 next_start; bool ret = true; - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, 0, (u64)-1); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); /* No chunk at all? 
Return false anyway */ - if (!em) { + if (!map) { ret = false; goto out; } - while (em) { - struct map_lookup *map; + while (map) { int missing = 0; int max_tolerated; int i; - map = em->map_lookup; max_tolerated = btrfs_get_num_tolerated_disk_barrier_failures( map->type); @@ -7253,18 +7354,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (!failing_dev) btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writable mount", - em->start, missing, max_tolerated); - free_extent_map(em); + map->start, missing, max_tolerated); + btrfs_free_chunk_map(map); ret = false; goto out; } - next_start = extent_map_end(em); - free_extent_map(em); + next_start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, next_start, - (u64)(-1) - next_start); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); } out: return ret; @@ -7757,20 +7855,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 physical_offset, u64 physical_len) { struct btrfs_dev_lookup_args args = { .devid = devid }; - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; int i; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em) { + map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); + if (!map) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); @@ -7778,12 +7871,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } - map = em->map_lookup; - stripe_len = btrfs_calc_stripe_length(em); + stripe_len = btrfs_calc_stripe_length(map); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", - physical_offset, devid, em->start, physical_len, + physical_offset, devid, map->start, physical_len, stripe_len); ret = -EUCLEAN; goto out; @@ -7806,7 +7898,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, "too many dev extents for chunk %llu found", - em->start); + map->start); ret = -EUCLEAN; goto out; } @@ -7852,32 +7944,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; struct rb_node *node; int ret = 0; - read_lock(&em_tree->lock); - for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { - em = rb_entry(node, struct extent_map, rb_node); - if (em->map_lookup->num_stripes != - em->map_lookup->verified_stripes) { + read_lock(&fs_info->mapping_tree_lock); + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { + struct btrfs_chunk_map *map; + + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + if (map->num_stripes != map->verified_stripes) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", - em->start, em->map_lookup->verified_stripes, - em->map_lookup->num_stripes); + map->start, map->verified_stripes, 
map->num_stripes); ret = -EUCLEAN; goto out; } } out: - read_unlock(&em_tree->lock); + read_unlock(&fs_info->mapping_tree_lock); return ret; } @@ -8129,7 +8219,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ASSERT(mirror_num > 0); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, - &bioc, smap, &mirror_ret, true); + &bioc, smap, &mirror_ret); if (ret < 0) return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5203095318b0..4481575dd70f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,13 +6,28 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H +#include <linux/blk_types.h> +#include <linux/sizes.h> +#include <linux/atomic.h> #include <linux/sort.h> -#include <linux/btrfs.h> -#include "async-thread.h" +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/log2.h> +#include <linux/kobject.h> +#include <linux/refcount.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs.h> #include "messages.h" -#include "tree-checker.h" #include "rcu-string.h" +struct block_device; +struct bdev_handle; +struct btrfs_fs_info; +struct btrfs_block_group; +struct btrfs_trans_handle; +struct btrfs_zoned_device_info; + #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) /* @@ -83,7 +98,10 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) -struct btrfs_zoned_device_info; +/* Special value encoding failure to write primary super block. */ +#define BTRFS_SUPER_PRIMARY_WRITE_ERROR (INT_MAX / 2) + +struct btrfs_fs_devices; struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ @@ -96,13 +114,11 @@ struct btrfs_device { u64 generation; + struct file *bdev_file; struct block_device *bdev; struct btrfs_zoned_device_info *zone_info; - /* block device holder for blkdev_get/put */ - void *holder; - /* * Device's major-minor number. Must be set even if the device is not * opened (bdev == NULL), unless the device is missing. @@ -135,6 +151,12 @@ struct btrfs_device { /* type and info about this device */ u64 type; + /* + * Counter of super block write errors, values larger than + * BTRFS_SUPER_PRIMARY_WRITE_ERROR encode primary super block write failure. + */ + atomic_t sb_write_errors; + /* minimal io size for this device */ u32 sector_size; @@ -284,6 +306,25 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; +#ifdef CONFIG_BTRFS_DEBUG +/* + * Checksum mode - offload it to workqueues or do it synchronously in + * btrfs_submit_chunk(). + */ +enum btrfs_offload_csum_mode { + /* + * Choose offloading checksum or do it synchronously automatically. + * Do it synchronously if the checksum is fast, or offload to workqueues + * otherwise. + */ + BTRFS_OFFLOAD_CSUM_AUTO, + /* Always offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_ON, + /* Never offload checksum to workqueues. 
*/ + BTRFS_OFFLOAD_CSUM_FORCE_OFF, +}; +#endif + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -296,6 +337,19 @@ struct btrfs_fs_devices { * - Following shall be true at all times: * - metadata_uuid == btrfs_header::fsid * - metadata_uuid == btrfs_dev_item::fsid + * + * - Relations between fsid and metadata_uuid in sb and fs_devices: + * - Normal: + * fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid + * sb->metadata_uuid == 0 + * + * - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set: + * fs_devices->fsid == sb->fsid + * fs_devices->metadata_uuid == sb->metadata_uuid + * + * - When in-memory fs_devices->temp_fsid is true + * fs_devices->fsid = random + * fs_devices->metadata_uuid == sb->fsid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -359,9 +413,10 @@ struct btrfs_fs_devices { bool rotating; /* Devices support TRIM/discard commands. */ bool discardable; - bool fsid_change; /* The filesystem is a seed filesystem. */ bool seeding; + /* The mount needs to use a randomly generated fsid. */ + bool temp_fsid; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -374,6 +429,11 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; + +#ifdef CONFIG_BTRFS_DEBUG + /* Checksum mode - offload it or do it synchronously. */ + enum btrfs_offload_csum_mode offload_csum_mode; +#endif }; #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ @@ -387,12 +447,12 @@ struct btrfs_fs_devices { struct btrfs_io_stripe { struct btrfs_device *dev; - union { - /* Block mapping */ - u64 physical; - /* For the endio handler */ - struct btrfs_io_context *bioc; - }; + /* Block mapping. */ + u64 physical; + u64 length; + bool rst_search_commit_root; + /* For the endio handler. */ + struct btrfs_io_context *bioc; }; struct btrfs_discard_stripe { @@ -420,11 +480,17 @@ struct btrfs_discard_stripe { struct btrfs_io_context { refcount_t refs; struct btrfs_fs_info *fs_info; - u64 map_type; /* get from map_lookup->type */ + /* Taken from struct btrfs_chunk_map::type. */ + u64 map_type; struct bio *orig_bio; atomic_t error; u16 max_errors; + u64 logical; + u64 size; + /* Raid stripe tree ordered entry. */ + struct list_head rst_ordered_entry; + /* * The total number of stripes, including the extra duplicated * stripe for replace. @@ -518,21 +584,33 @@ struct btrfs_raid_attr { extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; -struct map_lookup { +struct btrfs_chunk_map { + struct rb_node rb_node; + /* For mount time dev extent verification. 
*/ + int verified_stripes; + refcount_t refs; + u64 start; + u64 chunk_len; + u64 stripe_size; u64 type; int io_align; int io_width; int num_stripes; int sub_stripes; - int verified_stripes; /* For mount time dev extent verification */ struct btrfs_io_stripe stripes[]; }; -#define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_io_stripe) * (n))) +#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \ + (sizeof(struct btrfs_io_stripe) * (n))) + +static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) +{ + if (map && refcount_dec_and_test(&map->refs)) { + ASSERT(RB_EMPTY_NODE(&map->rb_node)); + kfree(map); + } +} -struct btrfs_balance_args; -struct btrfs_balance_progress; struct btrfs_balance_control { struct btrfs_balance_args data; struct btrfs_balance_args meta; @@ -587,7 +665,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } /* - * Do the type safe converstion from stripe_nr to offset inside the chunk. + * Do the type safe conversion from stripe_nr to offset inside the chunk. * * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger * than 4G. This does the proper type cast to avoid overflow. @@ -602,8 +680,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map); + struct btrfs_io_stripe *smap, int *mirror_num_ret); int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num); @@ -614,10 +691,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); -void btrfs_mapping_tree_free(struct extent_map_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags); +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, + bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); @@ -635,7 +713,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, void **holder); + struct file **bdev_file); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -653,8 +731,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, @@ -669,13 +745,24 
@@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); -u64 btrfs_calc_stripe_length(const struct extent_map *em); +u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); -struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp); +int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); +#endif + +struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, @@ -742,9 +829,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device); enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags); int btrfs_bg_type_to_factor(u64 flags); @@ -753,6 +838,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); -u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb); +const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b906f809650e..ce464cd8e0ac 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -24,7 +24,7 @@ #include "accessors.h" #include "dir-item.h" -int btrfs_getxattr(struct inode *inode, const char *name, +int btrfs_getxattr(const struct inode *inode, const char *name, void *buffer, size_t size) { struct btrfs_dir_item *di; @@ -120,7 +120,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * locks the inode's i_mutex before calling setxattr or removexattr. 
*/ if (flags & XATTR_REPLACE) { - ASSERT(inode_is_locked(inode)); + btrfs_assert_inode_locked(BTRFS_I(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) @@ -265,7 +265,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, return btrfs_setxattr_trans(inode, name, buffer, size, flags); } +static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler, + struct dentry *unused, + struct inode *inode, + const char *name, void *buffer, + size_t size) +{ + int ret; + bool is_cap = false; + + name = xattr_full_name(handler, name); + + /* + * security.capability doesn't cache the results, so calls into us + * constantly to see if there's a capability xattr. Cache the result + * here in order to avoid wasting time doing lookups for xattrs we know + * don't exist. + */ + if (strcmp(name, XATTR_NAME_CAPS) == 0) { + is_cap = true; + if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags)) + return -ENODATA; + } + + ret = btrfs_getxattr(inode, name, buffer, size); + if (ret == -ENODATA && is_cap) + set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + return ret; +} + +static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, + struct inode *inode, + const char *name, + const void *buffer, + size_t size, int flags) +{ + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + + name = xattr_full_name(handler, name); + if (strcmp(name, XATTR_NAME_CAPS) == 0) + clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + + return btrfs_setxattr_trans(inode, name, buffer, size, flags); +} + static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, @@ -404,11 +451,11 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_set_prop(trans, inode, name, value, size, flags); + ret = btrfs_set_prop(trans, BTRFS_I(inode), name, value, size, flags); if (!ret) { inode_inc_iversion(inode); inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); } @@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, static const struct xattr_handler btrfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = btrfs_xattr_handler_get, - .set = btrfs_xattr_handler_set, + .get = btrfs_xattr_handler_get_security, + .set = btrfs_xattr_handler_set_security, }; static const struct xattr_handler btrfs_trusted_xattr_handler = { @@ -442,7 +489,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = { .set = btrfs_xattr_handler_set_prop, }; -const struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler * const btrfs_xattr_handlers[] = { &btrfs_security_xattr_handler, &btrfs_trusted_xattr_handler, &btrfs_user_xattr_handler, @@ -457,7 +504,7 @@ static int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr; unsigned int nofs_flag; char 
*name; - int err = 0; + int ret = 0; /* * We're holding a transaction handle, so use a NOFS memory allocation @@ -468,19 +515,23 @@ static int btrfs_initxattrs(struct inode *inode, name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { - err = -ENOMEM; + ret = -ENOMEM; break; } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); - err = btrfs_setxattr(trans, inode, name, xattr->value, + + if (strcmp(name, XATTR_NAME_CAPS) == 0) + clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + + ret = btrfs_setxattr(trans, inode, name, xattr->value, xattr->value_len, 0); kfree(name); - if (err < 0) + if (ret < 0) break; } memalloc_nofs_restore(nofs_flag); - return err; + return ret; } int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 1cd3fc0a8f17..8dc4cf49f6f0 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,11 +6,15 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H -#include <linux/xattr.h> +struct dentry; +struct inode; +struct qstr; +struct xattr_handler; +struct btrfs_trans_handle; -extern const struct xattr_handler *btrfs_xattr_handlers[]; +extern const struct xattr_handler * const btrfs_xattr_handlers[]; -int btrfs_getxattr(struct inode *inode, const char *name, +int btrfs_getxattr(const struct inode *inode, const char *name, void *buffer, size_t size); int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 9f60d0bbd530..03958d1a53b0 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -18,7 +18,10 @@ #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/refcount.h> +#include "btrfs_inode.h" #include "compression.h" +#include "fs.h" +#include "subpage.h" /* workspace buffer size for s390 zlib hardware support */ #define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) @@ -91,29 +94,35 @@ fail: return ERR_PTR(-ENOMEM); } -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out) +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; char *data_in = NULL; - char *cpage_out; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; + char *cfolio_out; + int nr_folios = 0; + struct folio *in_folio = NULL; + struct folio *out_folio = NULL; unsigned long bytes_left; - unsigned int in_buf_pages; + unsigned int in_buf_folios; unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; + unsigned long nr_dest_folios = *out_folios; + const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u64 orig_end = start + len; - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; - if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) { - pr_warn("BTRFS: deflateInit failed\n"); + ret = zlib_deflateInit(&workspace->strm, workspace->level); + if (unlikely(ret != Z_OK)) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_err(inode->root->fs_info, + "zlib compression init failed, error %d root %llu inode %llu offset %llu", + 
ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; goto out; } @@ -121,18 +130,18 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[0] = out_page; - nr_pages = 1; + cfolio_out = folio_address(out_folio); + folios[0] = out_folio; + nr_folios = 1; workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; workspace->strm.avail_out = PAGE_SIZE; while (workspace->strm.total_in < len) { @@ -142,43 +151,63 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, */ if (workspace->strm.avail_in == 0) { bytes_left = len - workspace->strm.total_in; - in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_pages > 1) { + in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_folios > 1) { int i; - for (i = 0; i < in_buf_pages; i++) { + /* S390 hardware acceleration path, not subpage. */ + ASSERT(!btrfs_is_subpage( + inode_to_fs_info(mapping->host), + mapping)); + for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + data_in = kmap_local_folio(in_folio, 0); copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << PAGE_SHIFT); } else { + unsigned int pg_off; + unsigned int cur_len; + if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + data_in = kmap_local_folio(in_folio, pg_off); start += PAGE_SIZE; workspace->strm.next_in = data_in; + workspace->strm.avail_in = cur_len; } - workspace->strm.avail_in = min(bytes_left, - (unsigned long) workspace->buf_size); } ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); - if (ret != Z_OK) { - pr_debug("BTRFS: deflate in loop returned %d\n", - ret); + if (unlikely(ret != Z_OK)) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_warn(inode->root->fs_info, + "zlib compression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + start); zlib_deflateEnd(&workspace->strm); ret = -EIO; goto out; @@ -196,20 +225,20 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - 
pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } /* we're all done */ if (workspace->strm.total_in >= len) @@ -231,21 +260,21 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -EIO; goto out; } else if (workspace->strm.avail_out == 0) { - /* get another page for the stream end */ - if (nr_pages == nr_dest_pages) { + /* Get another folio for the stream end. */ + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } } zlib_deflateEnd(&workspace->strm); @@ -259,10 +288,10 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = workspace->strm.total_out; *total_in = workspace->strm.total_in; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); } return ret; @@ -275,13 +304,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; - unsigned long page_in_index = 0; + unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = cb->compressed_folios; - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -301,9 +330,14 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->strm.avail_in -= 2; } - if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { - pr_warn("BTRFS: inflateInit failed\n"); + ret = zlib_inflateInit2(&workspace->strm, wbits); + if (unlikely(ret != Z_OK)) { + struct btrfs_inode *inode = cb->bbio.inode; + kunmap_local(data_in); + btrfs_err(inode->root->fs_info, + "zlib decompression init failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), cb->start); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -331,21 +365,26 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap_local(data_in); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { data_in = NULL; break; } - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); } } - if (ret != Z_STREAM_END) + if (unlikely(ret != Z_STREAM_END)) { + btrfs_err(cb->bbio.inode->root->fs_info, + "zlib 
decompression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(cb->bbio.inode->root), + btrfs_ino(cb->bbio.inode), cb->start); ret = -EIO; - else + } else { ret = 0; + } done: zlib_inflateEnd(&workspace->strm); if (data_in) @@ -354,7 +393,7 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -380,8 +419,14 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, workspace->strm.avail_in -= 2; } - if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { - pr_warn("BTRFS: inflateInit failed\n"); + ret = zlib_inflateInit2(&workspace->strm, wbits); + if (unlikely(ret != Z_OK)) { + struct btrfs_inode *inode = folio_to_inode(dest_folio); + + btrfs_err(inode->root->fs_info, + "zlib decompression init failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(dest_folio)); return -EIO; } @@ -394,12 +439,16 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, if (ret != Z_STREAM_END) goto out; - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy); out: if (unlikely(to_copy != destlen)) { - pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", - to_copy, destlen); + struct btrfs_inode *inode = folio_to_inode(dest_folio); + + btrfs_err(inode->root->fs_info, +"zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(dest_folio), to_copy, destlen); ret = -EIO; } else { ret = 0; @@ -408,7 +457,7 @@ out: zlib_inflateEnd(&workspace->strm); if (unlikely(to_copy < destlen)) - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c4463c3f2068..53d8c49ec058 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -12,10 +12,8 @@ #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" -#include "transaction.h" #include "dev-replace.h" #include "space-info.h" -#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" @@ -89,9 +87,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, bool empty[BTRFS_NR_SB_LOG_ZONES]; bool full[BTRFS_NR_SB_LOG_ZONES]; sector_t sector; - int i; - for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); full[i] = sb_zone_is_full(&zones[i]); @@ -120,12 +117,11 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, return -ENOENT; } else if (full[0] && full[1]) { /* Compare two super blocks */ - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; struct page *page[BTRFS_NR_SB_LOG_ZONES]; struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; - int i; - for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT; u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) - BTRFS_SUPER_INFO_SIZE; 
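In the sb_write_pointer() hunk above, when both super block log zones are full, the newest copy in each zone sits in the last BTRFS_SUPER_INFO_SIZE slot below the zone's writable end (zone start plus capacity), and the two generations are then compared to decide which zone holds the live super block. Below is a small sketch of that bytenr computation; the 4096-byte super block size and 512-byte sector shift match the kernel's values but are hard-coded assumptions here.

/*
 * Sketch of the "both zones full" bytenr arithmetic above:
 * ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) - BTRFS_SUPER_INFO_SIZE.
 */
#include <stdint.h>
#include <stdio.h>

#define SB_INFO_SIZE 4096ULL	/* assumed BTRFS_SUPER_INFO_SIZE */
#define SECTOR_SHIFT 9		/* assumed 512-byte sectors */

static uint64_t last_sb_bytenr(uint64_t zone_start_sector,
			       uint64_t zone_capacity_sectors)
{
	uint64_t zone_end = (zone_start_sector + zone_capacity_sectors)
			    << SECTOR_SHIFT;

	/* last fully written super block slot before the zone end */
	return zone_end / SB_INFO_SIZE * SB_INFO_SIZE - SB_INFO_SIZE;
}

int main(void)
{
	/* a zone starting at 256M with 128M of usable capacity */
	printf("newest super block at byte %llu\n",
	       (unsigned long long)last_sb_bytenr(524288, 262144));
	return 0;
}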
@@ -146,7 +142,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, else sector = zones[0].start; - for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) + for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) btrfs_release_disk_super(super[i]); } else if (!full[0] && (empty[1] || full[1])) { sector = zones[0].wp; @@ -291,7 +287,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* The emulated zone size is determined from the size of device extent */ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct extent_buffer *leaf; @@ -308,28 +304,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; + return ret; /* No dev extents at all? Not good */ - if (ret > 0) { - ret = -EUCLEAN; - goto out; - } + if (ret > 0) + return -EUCLEAN; } leaf = path->nodes[0]; dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); - ret = 0; - -out: - btrfs_free_path(path); - - return ret; + return 0; } int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -578,26 +567,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) kvfree(zones); - switch (bdev_zoned_model(bdev)) { - case BLK_ZONED_HM: + if (bdev_is_zoned(bdev)) { model = "host-managed zoned"; emulated = ""; - break; - case BLK_ZONED_HA: - model = "host-aware zoned"; - emulated = ""; - break; - case BLK_ZONED_NONE: + } else { model = "regular"; emulated = "emulated "; - break; - default: - /* Just in case */ - btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", - bdev_zoned_model(bdev), - rcu_str_deref(device->name)); - ret = -EOPNOTSUPP; - goto out_free_zone_info; } btrfs_info_in_rcu(fs_info, @@ -609,9 +584,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) out: kvfree(zones); -out_free_zone_info: btrfs_destroy_dev_zone_info(device); - return ret; } @@ -670,8 +643,7 @@ out: return NULL; } -int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, - struct blk_zone *zone) +static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) { unsigned int nr_zones = 1; int ret; @@ -688,8 +660,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) struct btrfs_device *device; list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { - if (device->bdev && - bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + if (device->bdev && bdev_is_zoned(device->bdev)) { btrfs_err(fs_info, "zoned: mode not enabled but zoned device found: %pg", device->bdev); @@ -781,7 +752,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) * Check mount options here, because we might change fs_info->zoned * from fs_info->zone_size. 
*/ - ret = btrfs_check_mountopts_zoned(fs_info); + ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt); if (ret) return ret; @@ -789,7 +760,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, + unsigned long long *mount_opt) { if (!btrfs_is_zoned(info)) return 0; @@ -798,18 +770,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) * Space cache writing is not COWed. Disable that to avoid write errors * in sequential zones. */ - if (btrfs_test_opt(info, SPACE_CACHE)) { + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_err(info, "zoned: space cache v1 is not supported"); return -EINVAL; } - if (btrfs_test_opt(info, NODATACOW)) { + if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) { btrfs_err(info, "zoned: NODATACOW not supported"); return -EINVAL; } - btrfs_clear_and_info(info, DISCARD_ASYNC, - "zoned: async discard ignored and disabled for zoned mode"); + if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) { + btrfs_info(info, + "zoned: async discard ignored and disabled for zoned mode"); + btrfs_clear_opt(*mount_opt, DISCARD_ASYNC); + } return 0; } @@ -838,11 +813,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + unsigned int nofs_flags; + ASSERT(sb_zone_is_full(reset)); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_NOFS); + reset->start, reset->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -988,11 +966,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) * explicit ZONE_FINISH is not necessary. 
*/ if (zone->wp != zone->start + zone->capacity) { + unsigned int nofs_flags; int ret; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_NOFS); + zone->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; } @@ -1010,11 +991,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { + unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; + int ret; zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); @@ -1025,9 +1008,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) if (sb_zone + 1 >= nr_zones) return -ENOENT; - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zone_start_sector(sb_zone, bdev), + zone_sectors * BTRFS_NR_SB_LOG_ZONES); + memalloc_nofs_restore(nofs_flags); + return ret; } /* @@ -1138,12 +1124,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { + unsigned int nofs_flags; int ret; *bytes = 0; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_NOFS); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -1216,7 +1204,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; int ret; @@ -1251,7 +1239,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!ret) ret = -EUCLEAN; if (ret < 0) - goto out; + return ret; ret = btrfs_previous_extent_item(root, path, cache->start); if (ret) { @@ -1259,7 +1247,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, ret = 0; *offset_ret = 0; } - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -1271,15 +1259,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!(found_key.objectid >= cache->start && found_key.objectid + length <= cache->start + cache->length)) { - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; - ret = 0; - -out: - btrfs_free_path(path); - return ret; + return 0; } struct zone_info { @@ -1290,7 +1273,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct map_lookup *map) + struct btrfs_chunk_map *map) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device; @@ -1400,16 +1383,19 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, } static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { - if (map->type & BTRFS_BLOCK_GROUP_DATA) { - btrfs_err(bg->fs_info, - "zoned: profile DUP not yet supported on data bg"); + struct btrfs_fs_info *fs_info = 
bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree"); return -EINVAL; } + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + if (zone_info[0].alloc_offset == WP_MISSING_DEV) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", @@ -1436,16 +1422,128 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, } bg->alloc_offset = zone_info[0].alloc_offset; - bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity); + return 0; +} + +static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, + struct btrfs_chunk_map *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + int i; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + /* In case a device is missing we have a cap of 0, so don't use it. */ + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + + for (i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && + !btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in %s profile", + btrfs_bg_type_to_raid_name(map->type)); + return -EIO; + } + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_test_opt(fs_info, DEGRADED) && + !btrfs_zone_activate(bg)) { + return -EIO; + } + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + } + + if (zone_info[0].alloc_offset != WP_MISSING_DEV) + bg->alloc_offset = zone_info[0].alloc_offset; + else + bg->alloc_offset = zone_info[i - 1].alloc_offset; + + return 0; +} + +static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, + struct btrfs_chunk_map *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + bg->zone_capacity += zone_info[i].capacity; + bg->alloc_offset += zone_info[i].alloc_offset; + } + + return 0; +} + +static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, + struct btrfs_chunk_map *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if (test_bit(0, active) != test_bit(i, active)) 
{ + if (!btrfs_zone_activate(bg)) + return -EIO; + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + + if ((i % map->sub_stripes) == 0) { + bg->zone_capacity += zone_info[i].capacity; + bg->alloc_offset += zone_info[i].alloc_offset; + } + } + return 0; } int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 logical = cache->start; u64 length = cache->length; struct zone_info *zone_info = NULL; @@ -1454,6 +1552,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; + u64 profile; if (!btrfs_is_zoned(fs_info)) return 0; @@ -1466,21 +1565,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return -EIO; } - /* Get the chunk mapping */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, length); - read_unlock(&em_tree->lock); - - if (!em) + map = btrfs_find_chunk_map(fs_info, logical, length); + if (!map) return -EINVAL; - map = em->map_lookup; - - cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); - if (!cache->physical_map) { - ret = -ENOMEM; - goto out; - } + cache->physical_map = map; zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); if (!zone_info) { @@ -1524,7 +1613,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } - switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + switch (profile) { case 0: /* single */ ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; @@ -1532,11 +1622,18 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) ret = btrfs_load_block_group_dup(cache, map, zone_info, active); break; case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID0: + ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID10: + ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: - /* non-single profiles are not supported yet */ default: btrfs_err(fs_info, "zoned: profile %s not yet supported", btrfs_bg_type_to_raid_name(map->type)); @@ -1544,12 +1641,30 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && + profile != BTRFS_BLOCK_GROUP_RAID10) { + /* + * Detected broken write pointer. Make this block group + * unallocatable by setting the allocation pointer at the end of + * allocatable region. Relocating this block group will fix the + * mismatch. + * + * Currently, we cannot handle RAID0 or RAID10 case like this + * because we don't have a proper zone_capacity value. But, + * reading from this block group won't work anyway by a missing + * stripe. 
+ */ + cache->alloc_offset = cache->zone_capacity; + } + out: - if (cache->alloc_offset > fs_info->zone_size) { - btrfs_err(fs_info, - "zoned: invalid write pointer %llu in block group %llu", - cache->alloc_offset, cache->start); - ret = -EIO; + /* Reject non SINGLE data profiles without RST */ + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && + (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; } if (cache->alloc_offset > cache->zone_capacity) { @@ -1578,12 +1693,11 @@ out: spin_unlock(&fs_info->zone_active_bgs_lock); } } else { - kfree(cache->physical_map); + btrfs_free_chunk_map(cache->physical_map); cache->physical_map = NULL; } bitmap_free(active); kfree(zone_info); - free_extent_map(em); return ret; } @@ -1606,22 +1720,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) cache->zone_unusable = unusable; } -void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) -{ - if (!btrfs_is_zoned(eb->fs_info) || - btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) - return; - - ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - - memzero_extent_buffer(eb, 0, eb->len); - set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); - set_extent_buffer_dirty(eb); - set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY | EXTENT_NOWAIT, NULL); -} - bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); @@ -1633,7 +1731,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) if (!btrfs_is_zoned(fs_info)) return false; - if (!inode || !is_data_inode(&inode->vfs_inode)) + if (!inode || !is_data_inode(inode)) return false; if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) @@ -1675,7 +1773,7 @@ void btrfs_record_physical_zoned(struct btrfs_bio *bbio) static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, u64 logical) { - struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree; + struct extent_map_tree *em_tree = &ordered->inode->extent_tree; struct extent_map *em; ordered->disk_bytenr = logical; @@ -1683,7 +1781,9 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); - em->block_start = logical; + /* The em should be a new COW extent, thus it should not have an offset. */ + ASSERT(em->offset == 0); + em->disk_bytenr = logical; free_extent_map(em); write_unlock(&em_tree->lock); } @@ -1694,7 +1794,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, struct btrfs_ordered_extent *new; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset, + split_extent_map(ordered->inode, ordered->file_offset, ordered->num_bytes, len, logical)) return false; @@ -1708,7 +1808,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *sum; u64 logical, len; @@ -1752,7 +1852,7 @@ out: * here so that we don't attempt to log the csums later. 
*/ if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) { + test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) { while ((sum = list_first_entry_or_null(&ordered->list, typeof(*sum), list))) { list_del(&sum->list); @@ -1897,7 +1997,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, int i, ret; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bioc, NULL, NULL, 1); + &mapped_length, &bioc, NULL, NULL); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; @@ -1973,7 +2073,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_device *device; u64 physical; const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); @@ -2006,6 +2106,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) physical = map->stripes[i].physical; zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2081,9 +2184,10 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; + struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int ret = 0; int i; @@ -2121,8 +2225,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* Ensure all writes in this block group finish */ btrfs_wait_block_group_reservations(block_group); /* No need to wait for NOCOW writers. Zoned mode does not allow that */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, - block_group->length); + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group); /* Wait for extent buffers to be written. 
*/ if (is_metadata) wait_eb_writebacks(block_group); @@ -2159,27 +2262,36 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); + down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; + unsigned int nofs_flags; + + if (!device->bdev) + continue; if (zinfo->max_active_zones == 0) continue; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); - if (ret) + if (ret) { + up_read(&dev_replace->rwsem); return ret; + } if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } + up_read(&dev_replace->rwsem); if (!fully_written) btrfs_dec_block_group_ro(block_group); @@ -2340,12 +2452,12 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); } -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + u64 total = btrfs_super_total_bytes(fs_info->super_copy); u64 used = 0; - u64 total = 0; u64 factor; ASSERT(btrfs_is_zoned(fs_info)); @@ -2358,7 +2470,6 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) if (!device->bdev) continue; - total += device->disk_total_bytes; used += device->bytes_used; } mutex_unlock(&fs_devices->device_list_mutex); @@ -2412,7 +2523,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) spin_lock(&block_group->lock); if (block_group->reserved || block_group->alloc_offset == 0 || - (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || + !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) || test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); continue; @@ -2530,7 +2641,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) /* Release reservation for currently active block groups. 
*/ spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { - struct map_lookup *map = block_group->physical_map; + struct btrfs_chunk_map *map = block_group->physical_map; if (!(block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index b9cec523b778..7612e6572605 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -4,12 +4,27 @@ #define BTRFS_ZONED_H #include <linux/types.h> +#include <linux/atomic.h> #include <linux/blkdev.h> +#include <linux/blkzoned.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> #include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" #include "btrfs_inode.h" +#include "fs.h" + +struct block_device; +struct extent_buffer; +struct btrfs_bio; +struct btrfs_ordered_extent; +struct btrfs_fs_info; +struct btrfs_space_info; +struct btrfs_eb_write_context; +struct btrfs_fs_devices; #define BTRFS_DEFAULT_RECLAIM_THRESH (75) @@ -38,14 +53,13 @@ struct btrfs_zoned_device_info { void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered); #ifdef CONFIG_BLK_DEV_ZONED -int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, - struct blk_zone *zone); int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); +int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, + unsigned long long *mount_opt); int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret); int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, @@ -59,8 +73,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); -void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -77,7 +89,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); @@ -85,11 +97,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ -static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, - struct blk_zone *zone) -{ - return 0; -} static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) { @@ 
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b9cec523b778..7612e6572605 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -4,12 +4,27 @@
 #define BTRFS_ZONED_H

 #include <linux/types.h>
+#include <linux/atomic.h>
 #include <linux/blkdev.h>
+#include <linux/blkzoned.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include "messages.h"
 #include "volumes.h"
 #include "disk-io.h"
 #include "block-group.h"
 #include "btrfs_inode.h"
+#include "fs.h"
+
+struct block_device;
+struct extent_buffer;
+struct btrfs_bio;
+struct btrfs_ordered_extent;
+struct btrfs_fs_info;
+struct btrfs_space_info;
+struct btrfs_eb_write_context;
+struct btrfs_fs_devices;

 #define BTRFS_DEFAULT_RECLAIM_THRESH		(75)
@@ -38,14 +53,13 @@ struct btrfs_zoned_device_info {
 void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered);

 #ifdef CONFIG_BLK_DEV_ZONED
-int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
-		       struct blk_zone *zone);
 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
 int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
 void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
 struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+				unsigned long long *mount_opt);
 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret);
 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -59,8 +73,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
 int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
-			    struct extent_buffer *eb);
 bool btrfs_use_zone_append(struct btrfs_bio *bbio);
 void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
 int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
@@ -77,7 +89,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
				   struct extent_buffer *eb);
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
 void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
-bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
 void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 length);
 int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
@@ -85,11 +97,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info, bool do_finish);
 void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
 #else /* CONFIG_BLK_DEV_ZONED */
-static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
-				     struct blk_zone *zone)
-{
-	return 0;
-}

 static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 {
@@ -123,7 +130,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
	return -EOPNOTSUPP;
 }

-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+static inline int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+					      unsigned long long *mount_opt)
 {
	return 0;
 }
@@ -180,9 +188,6 @@ static inline int btrfs_load_block_group_zone_info(
 static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }

-static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
-					  struct extent_buffer *eb) { }
-
 static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
 {
	return false;
@@ -237,7 +242,7 @@ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }

 static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }

-static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
 {
	return false;
 }
@@ -323,8 +328,8 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
			(bdev_zone_sectors(bdev) << SECTOR_SHIFT);
	}

-	/* Do not allow Host Manged zoned device */
-	return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+	/* Do not allow Host Managed zoned device. */
+	return !bdev_is_zoned(bdev);
 }

 static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
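The btrfs_check_mountopts_zoned() change above moves option validation onto a caller-owned bitmask, so fs_info can stay const and options are only committed once they pass. A hypothetical caller sketch (apply_zoned_mount_options() is illustrative, not part of the patch):

/* Hypothetical caller: validate pending options, commit only on success. */
static int apply_zoned_mount_options(struct btrfs_fs_info *fs_info,
				     unsigned long long pending_opts)
{
	int ret;

	/* fs_info is only read here, matching the new const qualifier. */
	ret = btrfs_check_mountopts_zoned(fs_info, &pending_opts);
	if (ret)
		return ret;

	fs_info->mount_opt = pending_opts;
	return 0;
}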
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e7ac4ec809a4..c9ea37fabf65 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -18,11 +18,13 @@
 #include <linux/slab.h>
 #include <linux/zstd.h>
 #include "misc.h"
+#include "fs.h"
+#include "btrfs_inode.h"
 #include "compression.h"
-#include "ctree.h"
+#include "super.h"

 #define ZSTD_BTRFS_MAX_WINDOWLOG 17
-#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG)
 #define ZSTD_BTRFS_DEFAULT_LEVEL 3
 #define ZSTD_BTRFS_MAX_LEVEL 15
 /* 307s to avoid pathologically clashing with transaction commit */
@@ -145,7 +147,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
 }

 /*
- * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ * Calculate monotonic memory bounds.
  *
  * It is possible based on the level configurations that a higher level
  * workspace uses less memory than a lower level workspace. In order to reuse
@@ -218,7 +220,8 @@ void zstd_cleanup_workspace_manager(void)
 }

 /*
- * zstd_find_workspace - find workspace
+ * Find workspace for given level.
+ *
  * @level: compression level
  *
  * This iterates over the set bits in the active_map beginning at the requested
@@ -256,7 +259,8 @@ static struct list_head *zstd_find_workspace(unsigned int level)
 }

 /*
- * zstd_get_workspace - zstd's get_workspace
+ * Zstd get_workspace for level.
+ *
  * @level: compression level
  *
  * If @level is 0, then any compression level can be used. Therefore, we begin
@@ -296,7 +300,8 @@ again:
 }

 /*
- * zstd_put_workspace - zstd put_workspace
+ * Zstd put_workspace.
+ *
  * @ws: list_head for the workspace
  *
  * When putting back a workspace, we only need to update the LRU if we are of
@@ -370,52 +375,63 @@ fail:
	return ERR_PTR(-ENOMEM);
 }

-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
-		u64 start, struct page **pages, unsigned long *out_pages,
-		unsigned long *total_in, unsigned long *total_out)
+int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+		u64 start, struct folio **folios, unsigned long *out_folios,
+		unsigned long *total_in, unsigned long *total_out)
 {
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	zstd_cstream *stream;
	int ret = 0;
-	int nr_pages = 0;
-	struct page *in_page = NULL;  /* The current page to read */
-	struct page *out_page = NULL; /* The current page to write to */
+	int nr_folios = 0;
+	struct folio *in_folio = NULL;  /* The current folio to read. */
+	struct folio *out_folio = NULL; /* The current folio to write to. */
	unsigned long tot_in = 0;
	unsigned long tot_out = 0;
	unsigned long len = *total_out;
-	const unsigned long nr_dest_pages = *out_pages;
-	unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+	const unsigned long nr_dest_folios = *out_folios;
+	const u64 orig_end = start + len;
+	unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+	unsigned int pg_off;
+	unsigned int cur_len;
	zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len);

-	*out_pages = 0;
+	*out_folios = 0;
	*total_out = 0;
	*total_in = 0;

	/* Initialize the stream */
	stream = zstd_init_cstream(&params, len, workspace->mem, workspace->size);
-	if (!stream) {
-		pr_warn("BTRFS: zstd_init_cstream failed\n");
+	if (unlikely(!stream)) {
+		struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+		btrfs_err(inode->root->fs_info,
+	"zstd compression init level %d failed, root %llu inode %llu offset %llu",
+			  workspace->req_level, btrfs_root_id(inode->root),
+			  btrfs_ino(inode), start);
		ret = -EIO;
		goto out;
	}

	/* map in the first page of input data */
-	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-	workspace->in_buf.src = kmap_local_page(in_page);
+	ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+	if (ret < 0)
+		goto out;
+	pg_off = offset_in_page(start);
+	cur_len = btrfs_calc_input_length(orig_end, start);
+	workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
	workspace->in_buf.pos = 0;
-	workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
-
+	workspace->in_buf.size = cur_len;

	/* Allocate and map in the output buffer */
-	out_page = alloc_page(GFP_NOFS);
-	if (out_page == NULL) {
+	out_folio = btrfs_alloc_compr_folio();
+	if (out_folio == NULL) {
		ret = -ENOMEM;
		goto out;
	}
-	pages[nr_pages++] = out_page;
-	workspace->out_buf.dst = page_address(out_page);
+	folios[nr_folios++] = out_folio;
+	workspace->out_buf.dst = folio_address(out_folio);
	workspace->out_buf.pos = 0;
	workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -424,9 +440,14 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
		ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf);
-		if (zstd_is_error(ret2)) {
-			pr_debug("BTRFS: zstd_compress_stream returned %d\n",
-					zstd_get_error_code(ret2));
+		if (unlikely(zstd_is_error(ret2))) {
+			struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+			btrfs_warn(inode->root->fs_info,
+"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
+				   workspace->req_level, zstd_get_error_code(ret2),
+				   btrfs_root_id(inode->root), btrfs_ino(inode),
+				   start);
			ret = -EIO;
			goto out;
		}
@@ -450,17 +471,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
		if (workspace->out_buf.pos == workspace->out_buf.size) {
			tot_out += PAGE_SIZE;
			max_out -= PAGE_SIZE;
-			if (nr_pages == nr_dest_pages) {
+			if (nr_folios == nr_dest_folios) {
				ret = -E2BIG;
				goto out;
			}
-			out_page = alloc_page(GFP_NOFS);
-			if (out_page == NULL) {
+			out_folio = btrfs_alloc_compr_folio();
+			if (out_folio == NULL) {
				ret = -ENOMEM;
				goto out;
			}
-			pages[nr_pages++] = out_page;
-			workspace->out_buf.dst = page_address(out_page);
+			folios[nr_folios++] = out_folio;
+			workspace->out_buf.dst = folio_address(out_folio);
			workspace->out_buf.pos = 0;
			workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -476,22 +497,32 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
		if (workspace->in_buf.pos == workspace->in_buf.size) {
			tot_in += PAGE_SIZE;
			kunmap_local(workspace->in_buf.src);
-			put_page(in_page);
-			start += PAGE_SIZE;
-			len -= PAGE_SIZE;
-			in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-			workspace->in_buf.src = kmap_local_page(in_page);
+			workspace->in_buf.src = NULL;
+			folio_put(in_folio);
+			start += cur_len;
+			len -= cur_len;
+			ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+			if (ret < 0)
+				goto out;
+			pg_off = offset_in_page(start);
+			cur_len = btrfs_calc_input_length(orig_end, start);
+			workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
			workspace->in_buf.pos = 0;
-			workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
+			workspace->in_buf.size = cur_len;
		}
	}
	while (1) {
		size_t ret2;

		ret2 = zstd_end_stream(stream, &workspace->out_buf);
-		if (zstd_is_error(ret2)) {
-			pr_debug("BTRFS: zstd_end_stream returned %d\n",
-					zstd_get_error_code(ret2));
+		if (unlikely(zstd_is_error(ret2))) {
+			struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+			btrfs_err(inode->root->fs_info,
+"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
+				  workspace->req_level, zstd_get_error_code(ret2),
+				  btrfs_root_id(inode->root), btrfs_ino(inode),
+				  start);
			ret = -EIO;
			goto out;
		}
@@ -507,17 +538,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
			tot_out += PAGE_SIZE;
			max_out -= PAGE_SIZE;
-			if (nr_pages == nr_dest_pages) {
+			if (nr_folios == nr_dest_folios) {
				ret = -E2BIG;
				goto out;
			}
-			out_page = alloc_page(GFP_NOFS);
-			if (out_page == NULL) {
+			out_folio = btrfs_alloc_compr_folio();
+			if (out_folio == NULL) {
				ret = -ENOMEM;
				goto out;
			}
-			pages[nr_pages++] = out_page;
-			workspace->out_buf.dst = page_address(out_page);
+			folios[nr_folios++] = out_folio;
+			workspace->out_buf.dst = folio_address(out_folio);
			workspace->out_buf.pos = 0;
			workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
		}
@@ -531,10 +562,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
	*total_in = tot_in;
	*total_out = tot_out;
 out:
-	*out_pages = nr_pages;
+	*out_folios = nr_folios;
	if (workspace->in_buf.src) {
		kunmap_local(workspace->in_buf.src);
-		put_page(in_page);
+		folio_put(in_folio);
	}
	return ret;
 }
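In zstd_compress_folios() above, both the initial mapping and every refill follow the same pattern: get the folio for the current offset, then size the input window with btrfs_calc_input_length() so it stops at the folio or range boundary. Folded into one helper for clarity (a sketch; the two btrfs_* helpers' prototypes are assumed from their call sites in this patch):

/* Sketch: load the next input window starting at @start. */
static int zstd_load_input(struct address_space *mapping, u64 start,
			   u64 orig_end, struct folio **in_folio,
			   zstd_in_buffer *in_buf)
{
	int ret;

	ret = btrfs_compress_filemap_get_folio(mapping, start, in_folio);
	if (ret < 0)
		return ret;

	/* Window runs to the folio or range end, whichever comes first. */
	in_buf->src = kmap_local_folio(*in_folio, offset_in_page(start));
	in_buf->pos = 0;
	in_buf->size = btrfs_calc_input_length(orig_end, start);
	return 0;
}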
@@ -542,24 +573,28 @@ out:
 int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 {
	struct workspace *workspace = list_entry(ws, struct workspace, list);
-	struct page **pages_in = cb->compressed_pages;
+	struct folio **folios_in = cb->compressed_folios;
	size_t srclen = cb->compressed_len;
	zstd_dstream *stream;
	int ret = 0;
-	unsigned long page_in_index = 0;
-	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+	unsigned long folio_in_index = 0;
+	unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
	unsigned long buf_start;
	unsigned long total_out = 0;

	stream = zstd_init_dstream(
			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
-	if (!stream) {
-		pr_debug("BTRFS: zstd_init_dstream failed\n");
+	if (unlikely(!stream)) {
+		struct btrfs_inode *inode = cb->bbio.inode;
+
+		btrfs_err(inode->root->fs_info,
+		"zstd decompression init failed, root %llu inode %llu offset %llu",
+			  btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
		ret = -EIO;
		goto done;
	}

-	workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+	workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
	workspace->in_buf.pos = 0;
	workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -572,9 +607,13 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
		ret2 = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf);
-		if (zstd_is_error(ret2)) {
-			pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
-					zstd_get_error_code(ret2));
+		if (unlikely(zstd_is_error(ret2))) {
+			struct btrfs_inode *inode = cb->bbio.inode;
+
+			btrfs_err(inode->root->fs_info,
+		"zstd decompression failed, error %d root %llu inode %llu offset %llu",
+				  zstd_get_error_code(ret2), btrfs_root_id(inode->root),
+				  btrfs_ino(inode), cb->start);
			ret = -EIO;
			goto done;
		}
@@ -596,14 +635,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
		if (workspace->in_buf.pos == workspace->in_buf.size) {
			kunmap_local(workspace->in_buf.src);
-			page_in_index++;
-			if (page_in_index >= total_pages_in) {
+			folio_in_index++;
+			if (folio_in_index >= total_folios_in) {
				workspace->in_buf.src = NULL;
				ret = -EIO;
				goto done;
			}
			srclen -= PAGE_SIZE;
-			workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+			workspace->in_buf.src =
+				kmap_local_folio(folios_in[folio_in_index], 0);
			workspace->in_buf.pos = 0;
			workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
		}
@@ -616,80 +656,58 @@ done:
 }

 int zstd_decompress(struct list_head *ws, const u8 *data_in,
-		struct page *dest_page, unsigned long start_byte, size_t srclen,
+		struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
		size_t destlen)
 {
	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb);
+	const u32 sectorsize = fs_info->sectorsize;
	zstd_dstream *stream;
	int ret = 0;
-	size_t ret2;
-	unsigned long total_out = 0;
-	unsigned long pg_offset = 0;
+	unsigned long to_copy = 0;

	stream = zstd_init_dstream(
			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
-	if (!stream) {
-		pr_warn("BTRFS: zstd_init_dstream failed\n");
+	if (unlikely(!stream)) {
+		struct btrfs_inode *inode = folio_to_inode(dest_folio);
+
+		btrfs_err(inode->root->fs_info,
+		"zstd decompression init failed, root %llu inode %llu offset %llu",
+			  btrfs_root_id(inode->root), btrfs_ino(inode),
+			  folio_pos(dest_folio));
		ret = -EIO;
		goto finish;
	}

-	destlen = min_t(size_t, destlen, PAGE_SIZE);
-
	workspace->in_buf.src = data_in;
	workspace->in_buf.pos = 0;
	workspace->in_buf.size = srclen;

	workspace->out_buf.dst = workspace->buf;
	workspace->out_buf.pos = 0;
-	workspace->out_buf.size = PAGE_SIZE;
-
-	ret2 = 1;
-	while (pg_offset < destlen
-	       && workspace->in_buf.pos < workspace->in_buf.size) {
-		unsigned long buf_start;
-		unsigned long buf_offset;
-		unsigned long bytes;
-
-		/* Check if the frame is over and we still need more input */
-		if (ret2 == 0) {
-			pr_debug("BTRFS: zstd_decompress_stream ended early\n");
-			ret = -EIO;
-			goto finish;
-		}
-		ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
-				&workspace->in_buf);
-		if (zstd_is_error(ret2)) {
-			pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
-					zstd_get_error_code(ret2));
-			ret = -EIO;
-			goto finish;
-		}
-
-		buf_start = total_out;
-		total_out += workspace->out_buf.pos;
-		workspace->out_buf.pos = 0;
-
-		if (total_out <= start_byte)
-			continue;
-
-		if (total_out > start_byte && buf_start < start_byte)
-			buf_offset = start_byte - buf_start;
-		else
-			buf_offset = 0;
-
-		bytes = min_t(unsigned long, destlen - pg_offset,
-				workspace->out_buf.size - buf_offset);
-
-		memcpy_to_page(dest_page, pg_offset,
-				workspace->out_buf.dst + buf_offset, bytes);
-
-		pg_offset += bytes;
+	workspace->out_buf.size = sectorsize;
+
+	/*
+	 * Since both input and output buffers should not exceed one sector,
+	 * one call should end the decompression.
+	 */
+	ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf);
+	if (unlikely(zstd_is_error(ret))) {
+		struct btrfs_inode *inode = folio_to_inode(dest_folio);
+
+		btrfs_err(inode->root->fs_info,
+		"zstd decompression failed, error %d root %llu inode %llu offset %llu",
+			  zstd_get_error_code(ret), btrfs_root_id(inode->root),
+			  btrfs_ino(inode), folio_pos(dest_folio));
+		goto finish;
	}
-	ret = 0;
+	to_copy = workspace->out_buf.pos;
+	memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy);

 finish:
-	if (pg_offset < destlen) {
-		memzero_page(dest_page, pg_offset, destlen - pg_offset);
+	/* Error or early end. */
+	if (unlikely(to_copy < destlen)) {
+		ret = -EIO;
+		folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy);
	}
	return ret;
 }
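The zstd_decompress() rewrite is the structural payoff of the sector-size bound: the old streaming loop with its start_byte/buf_offset bookkeeping collapses into one zstd_decompress_stream() call, a copy, and a zero-fill of whatever is missing. The shape of the new path, as a standalone sketch (not the verbatim function):

/*
 * Sketch only: assumes srclen and destlen are each at most one sector,
 * as the patch's comment states, with workspace buffers already set up.
 */
static int zstd_decompress_one_sector(struct workspace *workspace,
				      zstd_dstream *stream,
				      struct folio *dest_folio,
				      unsigned long dest_pgoff, size_t destlen)
{
	unsigned long to_copy = 0;
	size_t ret2;
	int ret = 0;

	/* One call consumes the whole frame when both ends fit a sector. */
	ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
				      &workspace->in_buf);
	if (zstd_is_error(ret2)) {
		ret = -EIO;
	} else {
		to_copy = workspace->out_buf.pos;
		memcpy_to_folio(dest_folio, dest_pgoff,
				workspace->out_buf.dst, to_copy);
	}
	/* Error or early frame end: the missing tail reads back as zeroes. */
	if (to_copy < destlen) {
		ret = -EIO;
		folio_zero_range(dest_folio, dest_pgoff + to_copy,
				 destlen - to_copy);
	}
	return ret;
}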