summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig21
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/accessors.c103
-rw-r--r--fs/btrfs/accessors.h77
-rw-r--r--fs/btrfs/acl.c1
-rw-r--r--fs/btrfs/acl.h11
-rw-r--r--fs/btrfs/async-thread.c13
-rw-r--r--fs/btrfs/async-thread.h9
-rw-r--r--fs/btrfs/backref.c178
-rw-r--r--fs/btrfs/backref.h150
-rw-r--r--fs/btrfs/bio.c217
-rw-r--r--fs/btrfs/bio.h15
-rw-r--r--fs/btrfs/block-group.c516
-rw-r--r--fs/btrfs/block-group.h34
-rw-r--r--fs/btrfs/block-rsv.c38
-rw-r--r--fs/btrfs/block-rsv.h9
-rw-r--r--fs/btrfs/btrfs_inode.h275
-rw-r--r--fs/btrfs/check-integrity.c2871
-rw-r--r--fs/btrfs/check-integrity.h20
-rw-r--r--fs/btrfs/compression.c344
-rw-r--r--fs/btrfs/compression.h67
-rw-r--r--fs/btrfs/ctree.c436
-rw-r--r--fs/btrfs/ctree.h222
-rw-r--r--fs/btrfs/defrag.c388
-rw-r--r--fs/btrfs/defrag.h15
-rw-r--r--fs/btrfs/delalloc-space.c7
-rw-r--r--fs/btrfs/delalloc-space.h6
-rw-r--r--fs/btrfs/delayed-inode.c216
-rw-r--r--fs/btrfs/delayed-inode.h32
-rw-r--r--fs/btrfs/delayed-ref.c700
-rw-r--r--fs/btrfs/delayed-ref.h284
-rw-r--r--fs/btrfs/dev-replace.c89
-rw-r--r--fs/btrfs/dev-replace.h4
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/dir-item.h21
-rw-r--r--fs/btrfs/direct-io.c1079
-rw-r--r--fs/btrfs/direct-io.h14
-rw-r--r--fs/btrfs/discard.c55
-rw-r--r--fs/btrfs/disk-io.c670
-rw-r--r--fs/btrfs/disk-io.h40
-rw-r--r--fs/btrfs/export.c17
-rw-r--r--fs/btrfs/export.h4
-rw-r--r--fs/btrfs/extent-io-tree.c474
-rw-r--r--fs/btrfs/extent-io-tree.h70
-rw-r--r--fs/btrfs/extent-tree.c1682
-rw-r--r--fs/btrfs/extent-tree.h28
-rw-r--r--fs/btrfs/extent_io.c3244
-rw-r--r--fs/btrfs/extent_io.h158
-rw-r--r--fs/btrfs/extent_map.c885
-rw-r--r--fs/btrfs/extent_map.h155
-rw-r--r--fs/btrfs/fiemap.c930
-rw-r--r--fs/btrfs/fiemap.h11
-rw-r--r--fs/btrfs/file-item.c165
-rw-r--r--fs/btrfs/file-item.h18
-rw-r--r--fs/btrfs/file.c782
-rw-r--r--fs/btrfs/file.h19
-rw-r--r--fs/btrfs/free-space-cache.c49
-rw-r--r--fs/btrfs/free-space-cache.h14
-rw-r--r--fs/btrfs/free-space-tree.c72
-rw-r--r--fs/btrfs/free-space-tree.h6
-rw-r--r--fs/btrfs/fs.h210
-rw-r--r--fs/btrfs/inode-item.c32
-rw-r--r--fs/btrfs/inode-item.h15
-rw-r--r--fs/btrfs/inode.c3701
-rw-r--r--fs/btrfs/ioctl.c409
-rw-r--r--fs/btrfs/ioctl.h11
-rw-r--r--fs/btrfs/locking.c49
-rw-r--r--fs/btrfs/locking.h23
-rw-r--r--fs/btrfs/lru_cache.c2
-rw-r--r--fs/btrfs/lru_cache.h6
-rw-r--r--fs/btrfs/lzo.c168
-rw-r--r--fs/btrfs/messages.c44
-rw-r--r--fs/btrfs/messages.h16
-rw-r--r--fs/btrfs/misc.h6
-rw-r--r--fs/btrfs/ordered-data.c359
-rw-r--r--fs/btrfs/ordered-data.h71
-rw-r--r--fs/btrfs/orphan.c25
-rw-r--r--fs/btrfs/orphan.h5
-rw-r--r--fs/btrfs/print-tree.c35
-rw-r--r--fs/btrfs/print-tree.h3
-rw-r--r--fs/btrfs/props.c26
-rw-r--r--fs/btrfs/props.h11
-rw-r--r--fs/btrfs/qgroup.c1302
-rw-r--r--fs/btrfs/qgroup.h177
-rw-r--r--fs/btrfs/raid-stripe-tree.c296
-rw-r--r--fs/btrfs/raid-stripe-tree.h54
-rw-r--r--fs/btrfs/raid56.c152
-rw-r--r--fs/btrfs/raid56.h11
-rw-r--r--fs/btrfs/rcu-string.h6
-rw-r--r--fs/btrfs/ref-verify.c20
-rw-r--r--fs/btrfs/ref-verify.h9
-rw-r--r--fs/btrfs/reflink.c110
-rw-r--r--fs/btrfs/reflink.h4
-rw-r--r--fs/btrfs/relocation.c733
-rw-r--r--fs/btrfs/relocation.h9
-rw-r--r--fs/btrfs/root-tree.c28
-rw-r--r--fs/btrfs/root-tree.h18
-rw-r--r--fs/btrfs/scrub.c235
-rw-r--r--fs/btrfs/scrub.h6
-rw-r--r--fs/btrfs/send.c196
-rw-r--r--fs/btrfs/send.h10
-rw-r--r--fs/btrfs/space-info.c322
-rw-r--r--fs/btrfs/space-info.h66
-rw-r--r--fs/btrfs/subpage.c645
-rw-r--r--fs/btrfs/subpage.h164
-rw-r--r--fs/btrfs/super.c2471
-rw-r--r--fs/btrfs/super.h13
-rw-r--r--fs/btrfs/sysfs.c205
-rw-r--r--fs/btrfs/sysfs.h9
-rw-r--r--fs/btrfs/tests/btrfs-tests.c13
-rw-r--r--fs/btrfs/tests/btrfs-tests.h1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c20
-rw-r--r--fs/btrfs/tests/extent-map-tests.c517
-rw-r--r--fs/btrfs/tests/inode-tests.c276
-rw-r--r--fs/btrfs/transaction.c341
-rw-r--r--fs/btrfs/transaction.h45
-rw-r--r--fs/btrfs/tree-checker.c87
-rw-r--r--fs/btrfs/tree-checker.h5
-rw-r--r--fs/btrfs/tree-log.c817
-rw-r--r--fs/btrfs/tree-log.h53
-rw-r--r--fs/btrfs/tree-mod-log.c27
-rw-r--r--fs/btrfs/tree-mod-log.h14
-rw-r--r--fs/btrfs/ulist.c25
-rw-r--r--fs/btrfs/ulist.h3
-rw-r--r--fs/btrfs/uuid-tree.c194
-rw-r--r--fs/btrfs/uuid-tree.h11
-rw-r--r--fs/btrfs/verity.c25
-rw-r--r--fs/btrfs/verity.h7
-rw-r--r--fs/btrfs/volumes.c1800
-rw-r--r--fs/btrfs/volumes.h157
-rw-r--r--fs/btrfs/xattr.c77
-rw-r--r--fs/btrfs/xattr.h10
-rw-r--r--fs/btrfs/zlib.c191
-rw-r--r--fs/btrfs/zoned.c377
-rw-r--r--fs/btrfs/zoned.h41
-rw-r--r--fs/btrfs/zstd.c248
136 files changed, 19312 insertions, 16609 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a25c9910d90b..4fb925e8c981 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -48,27 +48,6 @@ config BTRFS_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
-config BTRFS_FS_CHECK_INTEGRITY
- bool "Btrfs with integrity check tool compiled in (DEPRECATED)"
- depends on BTRFS_FS
- help
- This feature has been deprecated and will be removed in 6.7.
-
- Adds code that examines all block write requests (including
- writes of the super block). The goal is to verify that the
- state of the filesystem on disk is always consistent, i.e.,
- after a power-loss or kernel panic event the filesystem is
- in a consistent state.
-
- If the integrity check tool is included and activated in
- the mount options, plenty of kernel memory is used, and
- plenty of additional CPU cycles are spent. Enabling this
- functionality is not intended for normal use.
-
- In most cases, unless you are a btrfs developer who needs
- to verify the integrity of (super)-block write requests
- during the run of a regression test, say N
-
config BTRFS_FS_RUN_SANITY_TESTS
bool "Btrfs will run sanity tests upon loading"
depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 90d53209755b..87617f2968bc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
- lru_cache.o
+ lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 206cf1612c1d..e3716516ca38 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -3,9 +3,10 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "messages.h"
-#include "ctree.h"
+#include "extent_io.h"
+#include "fs.h"
#include "accessors.h"
static bool check_setget_bounds(const struct extent_buffer *eb,
@@ -27,7 +28,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
{
token->eb = eb;
- token->kaddr = page_address(eb->pages[0]);
+ token->kaddr = folio_address(eb->folios[0]);
token->offset = 0;
}
@@ -50,7 +51,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* an offset into the extent buffer page array, cast to a specific type. This
* gives us all the type checking.
*
- * The extent buffer pages stored in the array pages do not form a contiguous
+ * The extent buffer pages stored in the array folios may not form a contiguous
* phyusical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
@@ -60,28 +61,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = token->eb->folio_size; \
+ const int unit_shift = token->eb->folio_shift; \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ return get_unaligned_le##bits(token->kaddr + oil); \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(token->kaddr + oil); \
\
- memcpy(lebytes, token->kaddr + oip, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(lebytes, token->kaddr + oil, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(lebytes + part, token->kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -89,19 +92,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = eb->folio_size; \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
- return get_unaligned_le##bits(kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(kaddr + oil); \
\
- memcpy(lebytes, kaddr + oip, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(lebytes, kaddr + oil, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(lebytes + part, kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -110,53 +115,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = token->eb->folio_size; \
+ const int unit_shift = token->eb->folio_shift; \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oip, lebytes, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(token->kaddr + oil, lebytes, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(token->kaddr, lebytes + part, size - part); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = eb->folio_size; \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, kaddr + oil); \
return; \
} \
\
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oip, lebytes, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(kaddr + oil, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(kaddr, lebytes + part, size - part); \
}
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 8cfc8214109c..7a7e0ef69973 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -3,7 +3,17 @@
#ifndef BTRFS_ACCESSORS_H
#define BTRFS_ACCESSORS_H
+#include <linux/unaligned.h>
#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/align.h>
+#include <linux/build_bug.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
struct btrfs_map_token {
struct extent_buffer *eb;
@@ -24,7 +34,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
static inline u8 get_unaligned_le8(const void *p)
{
- return *(u8 *)p;
+ return *(const u8 *)p;
}
static inline void put_unaligned_le8(u8 val, void *p)
@@ -38,8 +48,8 @@ static inline void put_unaligned_le8(u8 val, void *p)
offsetof(type, member), \
sizeof_field(type, member)))
-#define write_eb_member(eb, ptr, type, member, result) (\
- write_extent_buffer(eb, (char *)(result), \
+#define write_eb_member(eb, ptr, type, member, source) ( \
+ write_extent_buffer(eb, (const char *)(source), \
((unsigned long)(ptr)) + \
offsetof(type, member), \
sizeof_field(type, member)))
@@ -89,14 +99,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- const type *p = page_address(eb->pages[0]) + \
+ const type *p = folio_address(eb->folios[0]) + \
offset_in_page(eb->start); \
return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
put_unaligned_le##bits(val, &p->member); \
}
@@ -305,6 +315,11 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+
/* struct btrfs_dev_extent */
BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
@@ -335,7 +350,7 @@ static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
struct btrfs_tree_block_info *item,
- struct btrfs_disk_key *key)
+ const struct btrfs_disk_key *key)
{
write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
}
@@ -349,6 +364,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3
BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);
+BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref,
+ root_id, 64);
+
BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
type, 8);
BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
@@ -365,6 +383,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return sizeof(struct btrfs_extent_data_ref) +
offsetof(struct btrfs_extent_inline_ref, offset);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
return 0;
}
@@ -423,7 +443,7 @@ void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr);
static inline void btrfs_set_node_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
+ const struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr;
@@ -489,7 +509,7 @@ static inline void btrfs_item_key(const struct extent_buffer *eb,
}
static inline void btrfs_set_item_key(struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
+ const struct btrfs_disk_key *disk_key, int nr)
{
struct btrfs_item *item = btrfs_item_nr(eb, nr);
@@ -830,45 +850,6 @@ static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}
-static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
- const struct btrfs_disk_balance_args *disk)
-{
- memset(cpu, 0, sizeof(*cpu));
-
- cpu->profiles = le64_to_cpu(disk->profiles);
- cpu->usage = le64_to_cpu(disk->usage);
- cpu->devid = le64_to_cpu(disk->devid);
- cpu->pstart = le64_to_cpu(disk->pstart);
- cpu->pend = le64_to_cpu(disk->pend);
- cpu->vstart = le64_to_cpu(disk->vstart);
- cpu->vend = le64_to_cpu(disk->vend);
- cpu->target = le64_to_cpu(disk->target);
- cpu->flags = le64_to_cpu(disk->flags);
- cpu->limit = le64_to_cpu(disk->limit);
- cpu->stripes_min = le32_to_cpu(disk->stripes_min);
- cpu->stripes_max = le32_to_cpu(disk->stripes_max);
-}
-
-static inline void btrfs_cpu_balance_args_to_disk(
- struct btrfs_disk_balance_args *disk,
- const struct btrfs_balance_args *cpu)
-{
- memset(disk, 0, sizeof(*disk));
-
- disk->profiles = cpu_to_le64(cpu->profiles);
- disk->usage = cpu_to_le64(cpu->usage);
- disk->devid = cpu_to_le64(cpu->devid);
- disk->pstart = cpu_to_le64(cpu->pstart);
- disk->pend = cpu_to_le64(cpu->pend);
- disk->vstart = cpu_to_le64(cpu->vstart);
- disk->vend = cpu_to_le64(cpu->vend);
- disk->target = cpu_to_le64(cpu->target);
- disk->flags = cpu_to_le64(cpu->flags);
- disk->limit = cpu_to_le64(cpu->limit);
- disk->stripes_min = cpu_to_le32(cpu->stripes_min);
- disk->stripes_max = cpu_to_le32(cpu->stripes_max);
-}
-
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
@@ -966,6 +947,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
flags, 64);
BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
rescan, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item,
+ enable_gen, 64);
/* btrfs_qgroup_info_item */
BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 7427449a04a3..e0ba00d64ea0 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -12,7 +12,6 @@
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
-#include "btrfs_inode.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index a270e71ec05f..48b9ddae4a46 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -3,8 +3,15 @@
#ifndef BTRFS_ACL_H
#define BTRFS_ACL_H
+struct posix_acl;
+struct inode;
+struct btrfs_trans_handle;
+
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+struct mnt_idmap;
+struct dentry;
+
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
struct posix_acl *acl, int type);
@@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
#else
+#include <linux/errno.h>
+
+struct btrfs_trans_handle;
+
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ce083e99ef68..361a866c1995 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -9,8 +9,8 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <trace/events/btrfs.h>
#include "async-thread.h"
-#include "ctree.h"
enum {
WORK_DONE_BIT,
@@ -242,7 +242,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
break;
trace_btrfs_ordered_sched(work);
spin_unlock_irqrestore(lock, flags);
- work->ordered_func(work);
+ work->ordered_func(work, false);
/* now take the lock again and drop our item from the list */
spin_lock_irqsave(lock, flags);
@@ -277,7 +277,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
* We don't want to call the ordered free functions with
* the lock held.
*/
- work->ordered_free(work);
+ work->ordered_func(work, true);
/* NB: work must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, work);
}
@@ -285,7 +285,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
spin_unlock_irqrestore(lock, flags);
if (free_self) {
- self->ordered_free(self);
+ self->ordered_func(self, true);
/* NB: self must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, self);
}
@@ -300,7 +300,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
/*
* We should not touch things inside work in the following cases:
- * 1) after work->func() if it has no ordered_free
+ * 1) after work->func() if it has no ordered_func(..., true) to free
* Since the struct is freed in work->func().
* 2) after setting WORK_DONE_BIT
* The work may be freed in other threads almost instantly.
@@ -329,11 +329,10 @@ static void btrfs_work_helper(struct work_struct *normal_work)
}
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
- btrfs_func_t ordered_func, btrfs_func_t ordered_free)
+ btrfs_ordered_func_t ordered_func)
{
work->func = func;
work->ordered_func = ordered_func;
- work->ordered_free = ordered_free;
INIT_WORK(&work->normal_work, btrfs_work_helper);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 30f66c5e2e6e..04c2f3175828 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -7,17 +7,20 @@
#ifndef BTRFS_ASYNC_THREAD_H
#define BTRFS_ASYNC_THREAD_H
+#include <linux/compiler_types.h>
#include <linux/workqueue.h>
+#include <linux/list.h>
struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
+
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);
struct btrfs_work {
btrfs_func_t func;
- btrfs_func_t ordered_func;
- btrfs_func_t ordered_free;
+ btrfs_ordered_func_t ordered_func;
/* Don't touch things below */
struct work_struct normal_work;
@@ -35,7 +38,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags);
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
- btrfs_func_t ordered_func, btrfs_func_t ordered_free);
+ btrfs_ordered_func_t ordered_func);
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a2ba1c7fc16a..f8e1d5b2c512 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -198,10 +198,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache;
int __init btrfs_prelim_ref_init(void)
{
btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
- sizeof(struct prelim_ref),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
+ sizeof(struct prelim_ref), 0, 0, NULL);
if (!btrfs_prelim_ref_cache)
return -ENOMEM;
return 0;
@@ -222,8 +219,8 @@ static void free_pref(struct prelim_ref *ref)
* A -1 return indicates ref1 is a 'lower' block than ref2, while 1
* indicates a 'higher' block.
*/
-static int prelim_ref_compare(struct prelim_ref *ref1,
- struct prelim_ref *ref2)
+static int prelim_ref_compare(const struct prelim_ref *ref1,
+ const struct prelim_ref *ref2)
{
if (ref1->level < ref2->level)
return -1;
@@ -254,7 +251,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
}
static void update_share_count(struct share_check *sc, int oldcount,
- int newcount, struct prelim_ref *newref)
+ int newcount, const struct prelim_ref *newref)
{
if ((!sc) || (oldcount == 0 && newcount < 1))
return;
@@ -264,7 +261,7 @@ static void update_share_count(struct share_check *sc, int oldcount,
else if (oldcount < 1 && newcount > 0)
sc->share_count++;
- if (newref->root_id == sc->root->root_key.objectid &&
+ if (newref->root_id == btrfs_root_id(sc->root) &&
newref->wanted_disk_byte == sc->data_bytenr &&
newref->key_for_search.objectid == sc->inum)
sc->self_ref_count += newref->count;
@@ -772,7 +769,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
continue;
}
- if (sc && ref->root_id != sc->root->root_key.objectid) {
+ if (sc && ref->root_id != btrfs_root_id(sc->root)) {
free_pref(ref);
ret = BACKREF_FOUND_SHARED;
goto out;
@@ -922,40 +919,38 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
- struct btrfs_delayed_tree_ref *ref;
struct btrfs_key *key_ptr = NULL;
+ /* The owner of a tree block ref is the level. */
+ int level = btrfs_delayed_ref_owner(node);
if (head->extent_op && head->extent_op->update_key) {
btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
key_ptr = &key;
}
- ref = btrfs_delayed_node_to_tree_ref(node);
- ret = add_indirect_ref(fs_info, preftrees, ref->root,
- key_ptr, ref->level + 1,
- node->bytenr, count, sc,
- GFP_ATOMIC);
+ ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
+ key_ptr, level + 1, node->bytenr,
+ count, sc, GFP_ATOMIC);
break;
}
case BTRFS_SHARED_BLOCK_REF_KEY: {
- /* SHARED DIRECT METADATA backref */
- struct btrfs_delayed_tree_ref *ref;
-
- ref = btrfs_delayed_node_to_tree_ref(node);
+ /*
+ * SHARED DIRECT METADATA backref
+ *
+ * The owner of a tree block ref is the level.
+ */
+ int level = btrfs_delayed_ref_owner(node);
- ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
- ref->parent, node->bytenr, count,
+ ret = add_direct_ref(fs_info, preftrees, level + 1,
+ node->parent, node->bytenr, count,
sc, GFP_ATOMIC);
break;
}
case BTRFS_EXTENT_DATA_REF_KEY: {
/* NORMAL INDIRECT DATA backref */
- struct btrfs_delayed_data_ref *ref;
- ref = btrfs_delayed_node_to_data_ref(node);
-
- key.objectid = ref->objectid;
+ key.objectid = btrfs_delayed_ref_owner(node);
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = ref->offset;
+ key.offset = btrfs_delayed_ref_offset(node);
/*
* If we have a share check context and a reference for
@@ -975,18 +970,14 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
if (sc && count < 0)
sc->have_delayed_delete_refs = true;
- ret = add_indirect_ref(fs_info, preftrees, ref->root,
+ ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
&key, 0, node->bytenr, count, sc,
GFP_ATOMIC);
break;
}
case BTRFS_SHARED_DATA_REF_KEY: {
/* SHARED DIRECT FULL backref */
- struct btrfs_delayed_data_ref *ref;
-
- ref = btrfs_delayed_node_to_data_ref(node);
-
- ret = add_direct_ref(fs_info, preftrees, 0, ref->parent,
+ ret = add_direct_ref(fs_info, preftrees, 0, node->parent,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -1036,8 +1027,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
slot = path->slots[0];
item_size = btrfs_item_size(leaf, slot);
- BUG_ON(item_size < sizeof(*ei));
-
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
if (ctx->check_extent_item) {
@@ -1129,6 +1118,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
count, sc, GFP_NOFS);
break;
}
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA));
+ break;
default:
WARN_ON(1);
}
@@ -1432,8 +1424,10 @@ again:
if (ret < 0)
goto out;
if (ret == 0) {
- /* This shouldn't happen, indicates a bug or fs corruption. */
- ASSERT(ret != 0);
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
ret = -EUCLEAN;
goto out;
}
@@ -2222,6 +2216,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ return -EUCLEAN;
+ }
ret = btrfs_previous_extent_item(extent_root, path, 0);
if (ret) {
@@ -2244,7 +2245,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
eb = path->nodes[0];
item_size = btrfs_item_size(eb, path->slots[0]);
- BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
@@ -2623,7 +2623,7 @@ static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
btrfs_debug(fs_root->fs_info,
"following ref at offset %u for inode %llu in tree %llu",
cur, found_key.objectid,
- fs_root->root_key.objectid);
+ btrfs_root_id(fs_root));
ret = inode_to_path(parent, name_len,
(unsigned long)(iref + 1), eb, ipath);
if (ret)
@@ -2841,6 +2841,16 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf
return ret;
}
+static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
+{
+ iter->bytenr = 0;
+ iter->item_ptr = 0;
+ iter->cur_ptr = 0;
+ iter->end_ptr = 0;
+ btrfs_release_path(iter->path);
+ memset(&iter->cur_key, 0, sizeof(iter->cur_key));
+}
+
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
{
struct btrfs_fs_info *fs_info = iter->fs_info;
@@ -2859,6 +2869,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
if (ret < 0)
return ret;
if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
ret = -EUCLEAN;
goto release;
}
@@ -2929,6 +2943,14 @@ release:
return ret;
}
+static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
+ return true;
+ return false;
+}
+
/*
* Go to the next backref item of current bytenr, can be either inlined or
* keyed.
@@ -2941,7 +2963,7 @@ release:
*/
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
{
- struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct extent_buffer *eb = iter->path->nodes[0];
struct btrfs_root *extent_root;
struct btrfs_path *path = iter->path;
struct btrfs_extent_inline_ref *iref;
@@ -2992,7 +3014,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
}
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_backref_cache *cache, int is_reloc)
+ struct btrfs_backref_cache *cache, bool is_reloc)
{
int i;
@@ -3029,6 +3051,19 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
return node;
}
+void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ if (node) {
+ ASSERT(list_empty(&node->list));
+ ASSERT(list_empty(&node->lower));
+ ASSERT(node->eb == NULL);
+ cache->nr_nodes--;
+ btrfs_put_root(node->root);
+ kfree(node);
+ }
+}
+
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache)
{
@@ -3040,6 +3075,52 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge(
return edge;
}
+void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge)
+{
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
+}
+
+void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
+void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node)
+{
+ if (node->eb) {
+ btrfs_backref_unlock_node_buffer(node);
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+/*
+ * Drop the backref node from cache without cleaning up its children
+ * edges.
+ *
+ * This can only be called on node without parent edges.
+ * The children edges are still kept as is.
+ */
+void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node)
+{
+ ASSERT(list_empty(&node->upper));
+
+ btrfs_backref_drop_node_buffer(node);
+ list_del_init(&node->list);
+ list_del_init(&node->lower);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ btrfs_backref_free_node(tree, node);
+}
+
/*
* Drop the backref node from cache, also cleaning up all its
* upper edges and any uncached nodes in the path.
@@ -3115,6 +3196,19 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
ASSERT(!cache->nr_edges);
}
+void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which)
+{
+ ASSERT(upper && lower && upper->level == lower->level + 1);
+ edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
+ if (link_which & LINK_LOWER)
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ if (link_which & LINK_UPPER)
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+}
/*
* Handle direct tree backref
*
@@ -3265,7 +3359,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
btrfs_err(fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1, root->root_key.objectid,
+ cur->bytenr, level - 1, btrfs_root_id(root),
tree_key->objectid, tree_key->type, tree_key->offset);
btrfs_put_root(root);
ret = -ENOENT;
@@ -3423,7 +3517,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
int type;
cond_resched();
- eb = btrfs_backref_get_eb(iter);
+ eb = iter->path->nodes[0];
key.objectid = iter->bytenr;
if (btrfs_backref_iter_is_inline_ref(iter)) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 71d535e03dca..7dfcc9351bce 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -6,11 +6,23 @@
#ifndef BTRFS_BACKREF_H
#define BTRFS_BACKREF_H
-#include <linux/btrfs.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
-#include "ulist.h"
+#include "locking.h"
#include "disk-io.h"
#include "extent_io.h"
+#include "ctree.h"
+
+struct extent_inode_elem;
+struct ulist;
+struct btrfs_extent_item;
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
/*
* Used by implementations of iterate_extent_inodes_t (see definition below) to
@@ -247,7 +259,7 @@ struct prelim_ref {
struct rb_node rbnode;
u64 root_id;
struct btrfs_key key_for_search;
- int level;
+ u8 level;
int count;
struct extent_inode_elem *inode_list;
u64 parent;
@@ -271,22 +283,6 @@ struct btrfs_backref_iter {
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
-static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
-{
- if (!iter)
- return;
- btrfs_free_path(iter->path);
- kfree(iter);
-}
-
-static inline struct extent_buffer *btrfs_backref_get_eb(
- struct btrfs_backref_iter *iter)
-{
- if (!iter)
- return NULL;
- return iter->path->nodes[0];
-}
-
/*
* For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
* is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
@@ -306,25 +302,6 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
-static inline bool btrfs_backref_iter_is_inline_ref(
- struct btrfs_backref_iter *iter)
-{
- if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
- iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
- return true;
- return false;
-}
-
-static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
-{
- iter->bytenr = 0;
- iter->item_ptr = 0;
- iter->cur_ptr = 0;
- iter->end_ptr = 0;
- btrfs_release_path(iter->path);
- memset(&iter->cur_key, 0, sizeof(iter->cur_key));
-}
-
/*
* Backref cache related structures
*
@@ -440,102 +417,41 @@ struct btrfs_backref_cache {
* Reloction backref cache require more info for reloc root compared
* to generic backref cache.
*/
- unsigned int is_reloc;
+ bool is_reloc;
};
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_backref_cache *cache, int is_reloc);
+ struct btrfs_backref_cache *cache, bool is_reloc);
struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_cache *cache, u64 bytenr, int level);
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache);
-#define LINK_LOWER (1 << 0)
-#define LINK_UPPER (1 << 1)
-static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which)
-{
- ASSERT(upper && lower && upper->level == lower->level + 1);
- edge->node[LOWER] = lower;
- edge->node[UPPER] = upper;
- if (link_which & LINK_LOWER)
- list_add_tail(&edge->list[LOWER], &lower->upper);
- if (link_which & LINK_UPPER)
- list_add_tail(&edge->list[UPPER], &upper->lower);
-}
-
-static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
- struct btrfs_backref_node *node)
-{
- if (node) {
- ASSERT(list_empty(&node->list));
- ASSERT(list_empty(&node->lower));
- ASSERT(node->eb == NULL);
- cache->nr_nodes--;
- btrfs_put_root(node->root);
- kfree(node);
- }
-}
-
-static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
- struct btrfs_backref_edge *edge)
-{
- if (edge) {
- cache->nr_edges--;
- kfree(edge);
- }
-}
-
-static inline void btrfs_backref_unlock_node_buffer(
- struct btrfs_backref_node *node)
-{
- if (node->locked) {
- btrfs_tree_unlock(node->eb);
- node->locked = 0;
- }
-}
-
-static inline void btrfs_backref_drop_node_buffer(
- struct btrfs_backref_node *node)
-{
- if (node->eb) {
- btrfs_backref_unlock_node_buffer(node);
- free_extent_buffer(node->eb);
- node->eb = NULL;
- }
-}
+#define LINK_LOWER (1U << 0)
+#define LINK_UPPER (1U << 1)
-/*
- * Drop the backref node from cache without cleaning up its children
- * edges.
- *
- * This can only be called on node without parent edges.
- * The children edges are still kept as is.
- */
-static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
- struct btrfs_backref_node *node)
-{
- ASSERT(list_empty(&node->upper));
-
- btrfs_backref_drop_node_buffer(node);
- list_del_init(&node->list);
- list_del_init(&node->lower);
- if (!RB_EMPTY_NODE(&node->rb_node))
- rb_erase(&node->rb_node, &tree->rb_root);
- btrfs_backref_free_node(tree, node);
-}
+void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which);
+void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge);
+void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node);
+void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node);
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
+void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node);
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
- u64 bytenr, int errno)
+ u64 bytenr, int error)
{
- btrfs_panic(fs_info, errno,
+ btrfs_panic(fs_info, error,
"Inconsistency in backref cache found at offset %llu",
bytenr);
}
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 650972895652..5fd0b39d8c70 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -10,11 +10,10 @@
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
-#include "check-integrity.h"
#include "dev-replace.h"
-#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
+#include "raid-stripe-tree.h"
static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
@@ -30,7 +29,7 @@ struct btrfs_failed_bio {
/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
- return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
+ return bbio->inode && is_data_inode(bbio->inode);
}
static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
@@ -50,11 +49,12 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
bbio->end_io = end_io;
bbio->private = private;
atomic_set(&bbio->pending_ios, 1);
+ WRITE_ONCE(bbio->status, BLK_STS_OK);
}
/*
* Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
- * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
*
* Just like the underlying bio_alloc_bioset it will not fail as it is backed by
* a mempool.
@@ -74,20 +74,16 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
struct btrfs_bio *orig_bbio,
- u64 map_length, bool use_append)
+ u64 map_length)
{
struct btrfs_bio *bbio;
struct bio *bio;
- if (use_append) {
- unsigned int nr_segs;
+ bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
+ &btrfs_clone_bioset);
+ if (IS_ERR(bio))
+ return ERR_CAST(bio);
- bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
- &btrfs_clone_bioset, map_length);
- } else {
- bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
- GFP_NOFS, &btrfs_clone_bioset);
- }
bbio = btrfs_bio(bio);
btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
bbio->inode = orig_bbio->inode;
@@ -124,43 +120,26 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
- __btrfs_bio_end_io(bbio);
-}
-
-static void btrfs_orig_write_end_io(struct bio *bio);
-
-static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
- struct btrfs_bio *orig_bbio)
-{
- /*
- * For writes we tolerate nr_mirrors - 1 write failures, so we can't
- * just blindly propagate a write failure here. Instead increment the
- * error count in the original I/O context so that it is guaranteed to
- * be larger than the error tolerance.
- */
- if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
- struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
- struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
-
- atomic_add(orig_bioc->max_errors, &orig_bioc->error);
- } else {
- orig_bbio->bio.bi_status = bbio->bio.bi_status;
- }
-}
-
-static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
-{
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
- if (bbio->bio.bi_status)
- btrfs_bbio_propagate_error(bbio, orig_bbio);
btrfs_cleanup_bio(bbio);
bbio = orig_bbio;
}
- if (atomic_dec_and_test(&bbio->pending_ios))
+ /*
+ * At this point, bbio always points to the original btrfs_bio. Save
+ * the first error in it.
+ */
+ if (status != BLK_STS_OK)
+ cmpxchg(&bbio->status, BLK_STS_OK, status);
+
+ if (atomic_dec_and_test(&bbio->pending_ios)) {
+ /* Load split bio's error which might be set above. */
+ if (status == BLK_STS_OK)
+ bbio->bio.bi_status = READ_ONCE(bbio->status);
__btrfs_bio_end_io(bbio);
+ }
}
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
@@ -180,7 +159,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
if (atomic_dec_and_test(&fbio->repair_count)) {
- btrfs_orig_bbio_end_io(fbio->bbio);
+ btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
mempool_free(fbio, &btrfs_failed_bio_pool);
}
}
@@ -194,6 +173,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
int mirror = repair_bbio->mirror_num;
+ /*
+ * We can only trigger this for data bio, which doesn't support larger
+ * folios yet.
+ */
+ ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+
if (repair_bbio->bio.bi_status ||
!btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -206,7 +191,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
goto done;
}
- btrfs_submit_bio(repair_bbio, mirror);
+ btrfs_submit_bbio(repair_bbio, mirror);
return;
}
@@ -215,7 +200,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- bv->bv_page, bv->bv_offset, mirror);
+ page_folio(bv->bv_page), bv->bv_offset, mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -275,7 +260,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
- btrfs_submit_bio(repair_bbio, mirror);
+ btrfs_submit_bbio(repair_bbio, mirror);
return fbio;
}
@@ -321,7 +306,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
if (fbio)
btrfs_repair_done(fbio);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
@@ -355,7 +340,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
if (is_data_bbio(bbio))
btrfs_check_read_bio(bbio, bbio->bio.bi_private);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_simple_end_io(struct bio *bio)
@@ -373,9 +358,9 @@ static void btrfs_simple_end_io(struct bio *bio)
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
} else {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ if (bio_is_zone_append(bio) && !bio->bi_status)
btrfs_record_physical_zoned(bbio);
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
}
@@ -389,7 +374,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
btrfs_check_read_bio(bbio, NULL);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
@@ -416,7 +401,10 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
- btrfs_orig_bbio_end_io(bbio);
+ if (bio_is_zone_append(bio) && !bio->bi_status)
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
@@ -427,6 +415,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
+ } else if (bio_is_zone_append(bio)) {
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
/* Pass on control to the original bio this one was cloned from */
@@ -463,8 +453,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
dev->devid, bio->bi_iter.bi_size);
- btrfsic_check_bio(bio);
-
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
blkcg_punt_bio_submit(bio);
else
@@ -490,11 +478,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
bioc->stripes[dev_nr].bioc = bioc;
+ bioc->size = bio->bi_iter.bi_size;
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
-static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
- struct btrfs_io_stripe *smap, int mirror_num)
+static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
+ struct btrfs_io_stripe *smap, int mirror_num)
{
if (!bioc) {
/* Single mirror read/write fast path. */
@@ -568,16 +557,23 @@ static void run_one_async_start(struct btrfs_work *work)
*
* At IO completion time the csums attached on the ordered extent record are
* inserted into the tree.
+ *
+ * If called with @do_free == true, then it will free the work struct.
*/
-static void run_one_async_done(struct btrfs_work *work)
+static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
struct bio *bio = &async->bbio->bio;
+ if (do_free) {
+ kfree(container_of(work, struct async_submit_bio, work));
+ return;
+ }
+
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
- btrfs_orig_bbio_end_io(async->bbio);
+ btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
return;
}
@@ -587,18 +583,25 @@ static void run_one_async_done(struct btrfs_work *work)
* context. This changes nothing when cgroups aren't in use.
*/
bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
- __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
-}
-
-static void run_one_async_free(struct btrfs_work *work)
-{
- kfree(container_of(work, struct async_submit_bio, work));
+ btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
static bool should_async_write(struct btrfs_bio *bbio)
{
+ bool auto_csum_mode = true;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
+ enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
+
+ if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
+ return false;
+
+ auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
+#endif
+
/* Submit synchronously if the checksum implementation is fast. */
- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+ if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
return false;
/*
@@ -618,7 +621,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
/*
* Submit bio to an async queue.
*
- * Return true if the work has been succesfuly submitted, else false.
+ * Return true if the work has been successfully submitted, else false.
*/
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
@@ -636,12 +639,30 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
async->smap = *smap;
async->mirror_num = mirror_num;
- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
- run_one_async_free);
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
btrfs_queue_work(fs_info->workers, &async->work);
return true;
}
+static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
+{
+ unsigned int nr_segs;
+ int sector_offset;
+
+ map_length = min(map_length, bbio->fs_info->max_zone_append_size);
+ sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
+ &nr_segs, map_length);
+ if (sector_offset) {
+ /*
+ * bio_split_rw_at() could split at a size smaller than our
+ * sectorsize and thus cause unaligned I/Os. Fix that by
+ * always rounding down to the nearest boundary.
+ */
+ return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
+ }
+ return map_length;
+}
+
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
struct btrfs_inode *inode = bbio->inode;
@@ -656,20 +677,34 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
blk_status_t ret;
int error;
+ if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
+ smap.rst_search_commit_root = true;
+ else
+ smap.rst_search_commit_root = false;
+
btrfs_bio_counter_inc_blocked(fs_info);
error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num, 1);
+ &bioc, &smap, &mirror_num);
if (error) {
ret = errno_to_blk_status(error);
- goto fail;
+ btrfs_bio_counter_dec(fs_info);
+ goto end_bbio;
}
map_length = min(map_length, length);
if (use_append)
- map_length = min(map_length, fs_info->max_zone_append_size);
+ map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) {
- bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+ struct btrfs_bio *split;
+
+ split = btrfs_split_bio(fs_info, bbio, map_length);
+ if (IS_ERR(split)) {
+ ret = errno_to_blk_status(PTR_ERR(split));
+ btrfs_bio_counter_dec(fs_info);
+ goto end_bbio;
+ }
+ bbio = split;
bio = &bbio->bio;
}
@@ -690,12 +725,24 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
+ if (is_data_bbio(bbio) && bioc &&
+ btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+ /*
+ * No locking for the list update, as we only add to
+ * the list in the I/O submission path, and list
+ * iteration only happens in the completion path, which
+ * can't happen until after the last submission.
+ */
+ btrfs_get_bioc(bioc);
+ list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
+ }
+
/*
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.
*/
if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) {
if (should_async_write(bbio) &&
btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
@@ -713,7 +760,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
}
}
- __btrfs_submit_bio(bio, bioc, &smap, mirror_num);
+ btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
return map_length == length;
@@ -729,16 +776,15 @@ fail:
ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
ASSERT(remaining);
- remaining->bio.bi_status = ret;
- btrfs_orig_bbio_end_io(remaining);
+ btrfs_bio_end_io(remaining, ret);
}
- bbio->bio.bi_status = ret;
- btrfs_orig_bbio_end_io(bbio);
+end_bbio:
+ btrfs_bio_end_io(bbio, ret);
/* Do not submit another chunk */
return true;
}
-void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
+void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT(bbio->inode || bbio->file_offset == 0);
@@ -750,7 +796,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
/*
* Submit a repair write.
*
- * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
+ * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
* RAID setup. Here we only want to write the one bad copy, so we do the
* mapping ourselves and submit the bio directly.
*
@@ -758,8 +804,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num)
{
struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
@@ -790,9 +836,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, length, pg_offset);
-
- btrfsic_check_bio(&bio);
+ ret = bio_add_folio(&bio, folio, length, folio_offset);
+ ASSERT(ret);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
@@ -840,7 +885,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_
ASSERT(smap.dev == fs_info->dev_replace.srcdev);
smap.dev = fs_info->dev_replace.tgtdev;
}
- __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+ btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
return;
fail:
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index ca79decee060..e2fe16074ad6 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -7,12 +7,14 @@
#ifndef BTRFS_BIO_H
#define BTRFS_BIO_H
+#include <linux/types.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include "tree-checker.h"
struct btrfs_bio;
struct btrfs_fs_info;
+struct btrfs_inode;
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -27,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
/*
* Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and
- * passed to btrfs_submit_bio for mapping to the physical devices.
+ * passed to btrfs_submit_bbio() for mapping to the physical devices.
*/
struct btrfs_bio {
/*
@@ -40,7 +42,7 @@ struct btrfs_bio {
union {
/*
* For data reads: checksumming and original I/O information.
- * (for internal use in the btrfs_submit_bio machinery only)
+ * (for internal use in the btrfs_submit_bbio() machinery only)
*/
struct {
u8 *csum;
@@ -77,6 +79,9 @@ struct btrfs_bio {
/* File system that this I/O operates on. */
struct btrfs_fs_info *fs_info;
+ /* Save the first error status of split bio. */
+ blk_status_t status;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
@@ -102,10 +107,10 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
/* Submit using blkcg_punt_bio_submit. */
#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE
-void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
+void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num);
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 434cf3d5f4cf..7eef79ece5b3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -23,7 +23,7 @@
#include "extent-tree.h"
#ifdef CONFIG_BTRFS_DEBUG
-int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
+int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -34,15 +34,28 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
}
#endif
+static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
+{
+ /* The meta_write_pointer is available only on the zoned setup. */
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return false;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return block_group->start + block_group->alloc_offset >
+ block_group->meta_write_pointer;
+}
+
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
* is not in progress
*
* Should be called with balance_lock held
*/
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags)
{
- struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ const struct btrfs_balance_control *bctl = fs_info->balance_ctl;
u64 target = 0;
if (!bctl)
@@ -168,7 +181,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
cache);
kfree(cache->free_space_ctl);
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
kfree(cache);
}
}
@@ -418,7 +431,7 @@ struct btrfs_caching_control *btrfs_get_caching_control(
return ctl;
}
-void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
if (refcount_dec_and_test(&ctl->count))
kfree(ctl);
@@ -935,7 +948,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
caching_ctl->block_group = cache;
refcount_set(&caching_ctl->count, 2);
atomic_set(&caching_ctl->progress, 0);
- btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
@@ -1022,6 +1035,13 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
}
+static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_block_group *block_group)
@@ -1047,7 +1067,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em)
+ struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
@@ -1059,11 +1079,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int index;
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
- bool remove_em;
+ bool remove_map;
bool remove_rsv = false;
- block_group = btrfs_lookup_block_group(fs_info, group_start);
- BUG_ON(!block_group);
+ block_group = btrfs_lookup_block_group(fs_info, map->start);
+ if (!block_group)
+ return -ENOENT;
+
BUG_ON(!block_group->ro);
trace_btrfs_remove_block_group(block_group);
@@ -1240,6 +1262,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
+ /*
+ * Hitting this WARN means we removed a block group with an unwritten
+ * region. It will cause "unable to find chunk map for logical" errors.
+ */
+ if (WARN_ON(has_unwritten_metadata(block_group)))
+ btrfs_warn(fs_info,
+ "block group %llu is removed before metadata write out",
+ block_group->start);
+
set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
/*
@@ -1252,7 +1283,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* entries because we already removed them all when we called
* btrfs_remove_free_space_cache().
*
- * And we must not remove the extent map from the fs_info->mapping_tree
+ * And we must not remove the chunk map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
* ranges from being reused for a new block group. This is needed to
* avoid races with trimming and scrub.
@@ -1268,25 +1299,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
- remove_em = (atomic_read(&block_group->frozen) == 0);
+ remove_map = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- if (remove_em) {
- struct extent_map_tree *em_tree;
-
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- /* once for the tree */
- free_extent_map(em);
- }
+ if (remove_map)
+ btrfs_remove_chunk_map(fs_info, map);
out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
if (remove_rsv)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
btrfs_free_path(path);
return ret;
}
@@ -1295,15 +1318,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned int num_items;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
- ASSERT(em && em->start == chunk_offset);
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(map != NULL);
+ ASSERT(map->start == chunk_offset);
/*
* We need to reserve 3 + N units from the metadata space info in order
@@ -1324,9 +1344,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = em->map_lookup;
num_items = 3 + map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
@@ -1418,9 +1437,9 @@ out:
}
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *bg)
+ const struct btrfs_block_group *bg)
{
- struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *prev_trans = NULL;
const u64 start = bg->start;
const u64 end = start + bg->length - 1;
@@ -1442,7 +1461,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
* group in pinned_extents before we were able to clear the whole block
* group range from pinned_extents. This means that task can lookup for
* the block group after we unpinned it from pinned_extents and removed
- * it, leading to a BUG_ON() at unpin_extent_range().
+ * it, leading to an error at unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
@@ -1535,6 +1554,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* outstanding allocations in this block group. We do
* the ro check in case balance is currently acting on
* this block group.
+ *
+ * Also bail out if this is the only block group for its
+ * type, because otherwise we would lose profile
+ * information from fs_info->avail_*_alloc_bits and the
+ * next block group of this type would be created with a
+ * "single" profile (even if we're in a raid fs) because
+ * fs_info->avail_*_alloc_bits would be 0.
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
@@ -1563,8 +1589,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true);
- if (space_info->total_bytes - block_group->length < used &&
- block_group->zone_unusable < block_group->length) {
+ if ((space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) ||
+ has_unwritten_metadata(block_group)) {
/*
* Add a reference for the list, compensate for the ref
* drop under the "next" label for the
@@ -1752,33 +1779,30 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
-static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
{
if (btrfs_is_zoned(fs_info))
return btrfs_zoned_should_reclaim(fs_info);
return true;
}
-static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
+static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed)
{
- const struct btrfs_space_info *space_info = bg->space_info;
- const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
+ u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
const u64 new_val = bg->used;
const u64 old_val = new_val + bytes_freed;
- u64 thresh;
- if (reclaim_thresh == 0)
+ if (thresh_bytes == 0)
return false;
- thresh = mult_perc(bg->length, reclaim_thresh);
-
/*
* If we were below the threshold before don't reclaim, we are likely a
* brand new block group and we don't want to relocate new block groups.
*/
- if (old_val < thresh)
+ if (old_val < thresh_bytes)
return false;
- if (new_val >= thresh)
+ if (new_val >= thresh_bytes)
return false;
return true;
}
@@ -1826,6 +1850,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
+ u64 reclaimed;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1839,6 +1864,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
/* Don't race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
+ spin_lock(&space_info->lock);
spin_lock(&bg->lock);
if (bg->reserved || bg->pinned || bg->ro) {
/*
@@ -1848,6 +1874,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
* this block group.
*/
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
@@ -1866,6 +1893,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_mark_bg_unused(bg);
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
@@ -1882,10 +1910,23 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!should_reclaim_block_group(bg, bg->length)) {
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * to read only. As soon as the block group is read only it's
+ * zone_unusable value gets moved to the block group's read-only
+ * bytes and isn't available for calculations anymore. We also
+ * cache it before unlocking the block group, to prevent races
+ * (reports from KCSAN and such tools) with tasks updating it.
+ */
+ zone_unusable = bg->zone_unusable;
+
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
/*
* Get out fast, in case we're read-only or unmounting the
@@ -1900,13 +1941,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the blog group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore.
- */
- zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
@@ -1918,15 +1952,26 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(bg->used * 100, bg->length),
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
+ reclaimed = bg->used;
ret = btrfs_relocate_chunk(fs_info, bg->start);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
+ reclaimed = 0;
+ spin_lock(&space_info->lock);
+ space_info->reclaim_errors++;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ space_info->periodic_reclaim_ready = false;
+ spin_unlock(&space_info->lock);
}
+ spin_lock(&space_info->lock);
+ space_info->reclaim_count++;
+ space_info->reclaim_bytes += reclaimed;
+ spin_unlock(&space_info->lock);
next:
- if (ret) {
+ if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
/* Refcount held by the reclaim_bgs list after splice. */
spin_lock(&fs_info->unused_bgs_lock);
/*
@@ -1968,6 +2013,7 @@ end:
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
+ btrfs_reclaim_sweep(fs_info);
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
@@ -1987,11 +2033,10 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
-static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
- struct btrfs_path *path)
+static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
+ const struct btrfs_path *path)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
struct extent_buffer *leaf;
int slot;
@@ -2001,23 +2046,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
slot = path->slots[0];
leaf = path->nodes[0];
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
- read_unlock(&em_tree->lock);
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
+ if (!map) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset);
return -ENOENT;
}
- if (em->start != key->objectid || em->len != key->offset) {
+ if (map->start != key->objectid || map->chunk_len != key->offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
- key->objectid, key->offset, em->start, em->len);
+ key->objectid, key->offset, map->start, map->chunk_len);
ret = -EUCLEAN;
- goto out_free_em;
+ goto out_free_map;
}
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
@@ -2025,22 +2067,22 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
ret = -EUCLEAN;
}
-out_free_em:
- free_extent_map(em);
+out_free_map:
+ btrfs_free_chunk_map(map);
return ret;
}
static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- struct btrfs_key *key)
+ const struct btrfs_key *key)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
@@ -2087,8 +2129,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
@@ -2096,14 +2137,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
int i, nr = 0;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(map))
return -EIO;
- map = em->map_lookup;
- data_stripe_length = em->orig_block_len;
+ data_stripe_length = map->stripe_size;
io_stripe_size = BTRFS_STRIPE_LEN;
- chunk_start = em->start;
+ chunk_start = map->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -2157,7 +2197,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2262,49 +2302,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
while (1) {
- read_lock(&map_tree->lock);
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg;
+
/*
- * lookup_extent_mapping will return the first extent map
- * intersecting the range, so setting @len to 1 is enough to
+ * btrfs_find_chunk_map() will return the first chunk map
+ * intersecting the range, so setting @length to 1 is enough to
* get the first chunk.
*/
- em = lookup_extent_mapping(map_tree, start, 1);
- read_unlock(&map_tree->lock);
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, start, 1);
+ if (!map)
break;
- bg = btrfs_lookup_block_group(fs_info, em->start);
+ bg = btrfs_lookup_block_group(fs_info, map->start);
if (!bg) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
- em->start, em->len);
+ map->start, map->chunk_len);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
break;
}
- if (bg->start != em->start || bg->length != em->len ||
+ if (bg->start != map->start || bg->length != map->chunk_len ||
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
- em->start, em->len,
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ map->start, map->chunk_len,
+ map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
break;
}
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
}
return ret;
@@ -2432,28 +2470,25 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct rb_node *node;
int ret = 0;
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- struct extent_map *em;
- struct map_lookup *map;
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *bg;
- em = rb_entry(node, struct extent_map, rb_node);
- map = em->map_lookup;
- bg = btrfs_create_block_group_cache(fs_info, em->start);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ bg = btrfs_create_block_group_cache(fs_info, map->start);
if (!bg) {
ret = -ENOMEM;
break;
}
/* Fill dummy cache as FULL */
- bg->length = em->len;
+ bg->length = map->chunk_len;
bg->flags = map->type;
bg->cached = BTRFS_CACHE_FINISHED;
- bg->used = em->len;
+ bg->used = map->chunk_len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
/*
@@ -2632,8 +2667,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
}
static int insert_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 chunk_offset,
- u64 start, u64 num_bytes)
+ const struct btrfs_device *device, u64 chunk_offset,
+ u64 start, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
@@ -2681,19 +2716,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_device *device;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_offset;
- u64 stripe_size;
int i;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- map = em->map_lookup;
- stripe_size = em->orig_block_len;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
/*
* Take the device list mutex to prevent races with the final phase of
@@ -2710,13 +2740,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
dev_offset = map->stripes[i].physical;
ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
- stripe_size);
+ map->stripe_size);
if (ret)
break;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2772,9 +2802,43 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /*
+ * If the block group is still unused, add it to the list of
+ * unused block groups. The block group may have been created in
+ * order to satisfy a space reservation, in which case the
+ * extent allocation only happens later. But often we don't
+ * actually need to allocate space that we previously reserved,
+ * so the block group may become unused for a long time. For
+ * example for metadata we generally reserve space for a worst
+ * possible scenario, but then don't end up allocating all that
+ * space or none at all (due to no need to COW, extent buffers
+ * were already COWed in the current transaction and still
+ * unwritten, tree heights lower than the maximum possible
+ * height, etc). For data we generally reserve the axact amount
+ * of space we are going to allocate later, the exception is
+ * when using compression, as we must reserve space based on the
+ * uncompressed data size, because the compression is only done
+ * when writeback triggered and we don't know how much space we
+ * are actually going to need, so we reserve the uncompressed
+ * size because the data may be uncompressible in the worst case.
+ */
+ if (ret == 0) {
+ bool used;
+
+ spin_lock(&block_group->lock);
+ used = btrfs_is_block_group_used(block_group);
+ spin_unlock(&block_group->lock);
+
+ if (!used)
+ btrfs_mark_bg_unused(block_group);
+ }
}
btrfs_trans_release_chunk_metadata(trans);
}
@@ -2783,7 +2847,7 @@ next:
* For extent tree v2 we use the block_group_item->chunk_offset to point at our
* global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
*/
-static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
+static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset)
{
u64 div = SZ_1G;
u64 index;
@@ -2882,8 +2946,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
- trans->delayed_ref_updates++;
- btrfs_update_delayed_refs_rsv(trans);
+ btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
return cache;
@@ -2974,7 +3037,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
goto unlock_out;
/*
- * Skip chunk alloction if the bg is SYSTEM, this is to avoid system
+ * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
* chunk allocation storm to exhaust the system chunk array. Otherwise
* we still want to try our best to mark the block group read-only.
*/
@@ -3116,7 +3179,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
@@ -3168,7 +3230,7 @@ again:
* time.
*/
BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
/*
* So theoretically we could recover from this, simply set the
@@ -3435,7 +3497,7 @@ again:
if (should_put)
btrfs_put_block_group(cache);
if (drop_reserve)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
@@ -3539,8 +3601,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans,
- (unsigned long) -1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
@@ -3583,7 +3644,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
/* If its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -3608,12 +3669,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_block_group *cache = NULL;
- u64 total = num_bytes;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_group *cache;
u64 old_val;
- u64 byte_in_group;
+ bool reclaim = false;
+ bool bg_already_dirty = true;
int factor;
- int ret = 0;
/* Block accounting for super block */
spin_lock(&info->delalloc_root_lock);
@@ -3625,97 +3686,91 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_super_bytes_used(info->super_copy, old_val);
spin_unlock(&info->delalloc_root_lock);
- while (total) {
- struct btrfs_space_info *space_info;
- bool reclaim = false;
-
- cache = btrfs_lookup_block_group(info, bytenr);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
- space_info = cache->space_info;
- factor = btrfs_bg_type_to_factor(cache->flags);
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache)
+ return -ENOENT;
- /*
- * If this block group has free space cache written out, we
- * need to make sure to load it if we are removing space. This
- * is because we need the unpinning stage to actually add the
- * space back to the block group, otherwise we will leak space.
- */
- if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, true);
+ /* An extent can not span multiple block groups. */
+ ASSERT(bytenr + num_bytes <= cache->start + cache->length);
- byte_in_group = bytenr - cache->start;
- WARN_ON(byte_in_group > cache->length);
+ space_info = cache->space_info;
+ factor = btrfs_bg_type_to_factor(cache->flags);
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
+ /*
+ * If this block group has free space cache written out, we need to make
+ * sure to load it if we are removing space. This is because we need
+ * the unpinning stage to actually add the space back to the block group,
+ * otherwise we will leak space.
+ */
+ if (!alloc && !btrfs_block_group_done(cache))
+ btrfs_cache_block_group(cache, true);
- if (btrfs_test_opt(info, SPACE_CACHE) &&
- cache->disk_cache_state < BTRFS_DC_CLEAR)
- cache->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
- old_val = cache->used;
- num_bytes = min(total, cache->length - byte_in_group);
- if (alloc) {
- old_val += num_bytes;
- cache->used = old_val;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
- space_info->bytes_used += num_bytes;
- space_info->disk_used += num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
- } else {
- old_val -= num_bytes;
- cache->used = old_val;
- cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(info, space_info,
- num_bytes);
- space_info->bytes_used -= num_bytes;
- space_info->disk_used -= num_bytes * factor;
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+ old_val = cache->used;
+ if (alloc) {
+ old_val += num_bytes;
+ cache->used = old_val;
+ cache->reserved -= num_bytes;
+ cache->reclaim_mark = 0;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_used += num_bytes;
+ space_info->disk_used += num_bytes * factor;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ btrfs_space_info_update_reclaimable(space_info, -num_bytes);
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->used = old_val;
+ cache->pinned += num_bytes;
+ btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
+ space_info->bytes_used -= num_bytes;
+ space_info->disk_used -= num_bytes * factor;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ btrfs_space_info_update_reclaimable(space_info, num_bytes);
+ else
reclaim = should_reclaim_block_group(cache, num_bytes);
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
-
- set_extent_bit(&trans->transaction->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_DIRTY, NULL);
- }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &trans->transaction->dirty_bgs);
- trans->delayed_ref_updates++;
- btrfs_get_block_group(cache);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
+ set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ }
- /*
- * No longer have used bytes in this block group, queue it for
- * deletion. We do this after adding the block group to the
- * dirty list to avoid races between cleaner kthread and space
- * cache writeout.
- */
- if (!alloc && old_val == 0) {
- if (!btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_mark_bg_unused(cache);
- } else if (!alloc && reclaim) {
- btrfs_mark_bg_to_reclaim(cache);
- }
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
- btrfs_put_block_group(cache);
- total -= num_bytes;
- bytenr += num_bytes;
+ /*
+ * No longer have used bytes in this block group, queue it for deletion.
+ * We do this after adding the block group to the dirty list to avoid
+ * races between cleaner kthread and space cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
+ btrfs_put_block_group(cache);
+
/* Modified block groups are accounted for in the delayed_refs_rsv. */
- btrfs_update_delayed_refs_rsv(trans);
- return ret;
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(info);
+
+ return 0;
}
/*
@@ -3819,8 +3874,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
}
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *sinfo, int force)
+static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *sinfo, int force)
{
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
@@ -4195,7 +4250,7 @@ out:
return ret;
}
-static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
+static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type)
{
u64 num_dev;
@@ -4336,13 +4391,13 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
spin_lock(&block_group->lock);
if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
&block_group->runtime_flags)) {
- struct inode *inode = block_group->inode;
+ struct btrfs_inode *inode = block_group->inode;
block_group->inode = NULL;
spin_unlock(&block_group->lock);
ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
+ iput(&inode->vfs_inode);
} else {
spin_unlock(&block_group->lock);
}
@@ -4487,8 +4542,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
bool cleanup;
spin_lock(&block_group->lock);
@@ -4497,17 +4550,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->start,
- 1);
- BUG_ON(!em); /* logic error, can't happen */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us and once for the tree */
- free_extent_map(em);
- free_extent_map(em);
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
+ /* Logic error, can't happen. */
+ ASSERT(map);
+
+ btrfs_remove_chunk_map(fs_info, map);
+
+ /* Once for our lookup reference. */
+ btrfs_free_chunk_map(map);
/*
* We may have left one free space entry and other possible
@@ -4602,7 +4654,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
return 0;
}
-bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
+bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
{
if (btrfs_is_zoned(bg->fs_info))
return false;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 089979981e4a..36937eeab9b8 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -3,8 +3,23 @@
#ifndef BTRFS_BLOCK_GROUP_H
#define BTRFS_BLOCK_GROUP_H
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/wait.h>
+#include <linux/sizes.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs_tree.h>
#include "free-space-cache.h"
+struct btrfs_chunk_map;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_trans_handle;
+
enum btrfs_disk_cache_state {
BTRFS_DC_WRITTEN,
BTRFS_DC_ERROR,
@@ -100,7 +115,7 @@ struct btrfs_caching_control {
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
- struct inode *inode;
+ struct btrfs_inode *inode;
spinlock_t lock;
u64 start;
u64 length;
@@ -243,14 +258,15 @@ struct btrfs_block_group {
u64 zone_unusable;
u64 zone_capacity;
u64 meta_write_pointer;
- struct map_lookup *physical_map;
+ struct btrfs_chunk_map *physical_map;
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
enum btrfs_block_group_size_class size_class;
+ u64 reclaim_mark;
};
-static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
{
return (block_group->start + block_group->length);
}
@@ -262,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
}
-static inline bool btrfs_is_block_group_data_only(
- struct btrfs_block_group *block_group)
+static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group)
{
/*
* In mixed mode the fragmentation is expected to be high, lowering the
@@ -274,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only(
}
#ifdef CONFIG_BTRFS_DEBUG
-int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group);
+int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group);
#endif
struct btrfs_block_group *btrfs_lookup_first_block_group(
@@ -295,7 +310,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
-void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
@@ -304,7 +318,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em);
+ struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
@@ -355,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
}
-static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
+static inline int btrfs_block_group_done(const struct btrfs_block_group *cache)
{
smp_mb();
return cache->cached == BTRFS_CACHE_FINISHED ||
@@ -372,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size);
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
enum btrfs_block_group_size_class size_class,
bool force_wrong_size_class);
-bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg);
+bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index db8da4e7b228..a07b9594dc70 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,7 +6,6 @@
#include "space-info.h"
#include "transaction.h"
#include "block-group.h"
-#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
@@ -221,7 +220,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -261,7 +261,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -279,10 +280,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *target = NULL;
/*
- * If we are the delayed_rsv then push to the global rsv, otherwise dump
- * into the delayed rsv if it is not full.
+ * If we are a delayed block reserve then push to the global rsv,
+ * otherwise dump into the global delayed reserve if it is not full.
*/
- if (block_rsv == delayed_rsv)
+ if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS)
target = global_rsv;
else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
@@ -340,9 +341,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
read_lock(&fs_info->global_root_lock);
rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
rb_node) {
- if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) {
num_bytes += btrfs_root_used(&root->root_item);
min_items++;
}
@@ -354,6 +355,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
min_items++;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item);
+ min_items++;
+ }
+
/*
* But we also want to reserve enough space so we can do the fallback
* global reserve for an unlink, which is an additional
@@ -400,11 +406,12 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- switch (root->root_key.objectid) {
+ switch (btrfs_root_id(root)) {
case BTRFS_CSUM_TREE_OBJECTID:
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
@@ -461,8 +468,7 @@ static struct btrfs_block_rsv *get_block_rsv(
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
(root == fs_info->uuid_root) ||
- (trans->adding_csums &&
- root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
+ (trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -517,8 +523,8 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ blocksize, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
/*
@@ -539,7 +545,7 @@ try_reserve:
* one last time to force a reservation if there's enough actual space
* on disk to make the reservation.
*/
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
BTRFS_RESERVE_FLUSH_EMERGENCY);
if (!ret)
return block_rsv;
@@ -547,7 +553,7 @@ try_reserve:
return ERR_PTR(ret);
}
-int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
+int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
u64 needed_bytes;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 43a9a6b5a79f..d12b1fac5c74 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -3,8 +3,15 @@
#ifndef BTRFS_BLOCK_RSV_H
#define BTRFS_BLOCK_RSV_H
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/spinlock.h>
+
struct btrfs_trans_handle;
struct btrfs_root;
+struct btrfs_space_info;
+struct btrfs_block_rsv;
+struct btrfs_fs_info;
enum btrfs_reserve_flush_enum;
/*
@@ -82,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
-int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
+int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ec6679a538c1..db53a3263fbd 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -8,11 +8,31 @@
#include <linux/hash.h>
#include <linux/refcount.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/compiler.h>
+#include <linux/fscrypt.h>
+#include <linux/lockdep.h>
+#include <uapi/linux/btrfs_tree.h>
+#include <trace/events/btrfs.h>
+#include "block-rsv.h"
#include "extent_map.h"
#include "extent_io.h"
+#include "extent-io-tree.h"
#include "ordered-data.h"
#include "delayed-inode.h"
+struct extent_state;
+struct posix_acl;
+struct iov_iter;
+struct writeback_control;
+struct btrfs_root;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+
/*
* Since we search a directory based on f_pos (struct dir_context::pos) we have
* to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
@@ -39,7 +59,6 @@ enum {
*/
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
- BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
/*
@@ -67,6 +86,41 @@ enum {
BTRFS_INODE_VERITY_IN_PROGRESS,
/* Set when this inode is a free space inode. */
BTRFS_INODE_FREE_SPACE_INODE,
+ /* Set when there are no capabilities in XATTs for the inode. */
+ BTRFS_INODE_NO_CAP_XATTR,
+ /*
+ * Set if an error happened when doing a COW write before submitting a
+ * bio or during writeback. Used for both buffered writes and direct IO
+ * writes. This is to signal a fast fsync that it has to wait for
+ * ordered extents to complete and therefore not log extent maps that
+ * point to unwritten extents (when an ordered extent completes and it
+ * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
+ * range).
+ */
+ BTRFS_INODE_COW_WRITE_ERROR,
+ /*
+ * Indicate this is a directory that points to a subvolume for which
+ * there is no root reference item. That's a case like the following:
+ *
+ * $ btrfs subvolume create /mnt/parent
+ * $ btrfs subvolume create /mnt/parent/child
+ * $ btrfs subvolume snapshot /mnt/parent /mnt/snap
+ *
+ * If subvolume "parent" is root 256, subvolume "child" is root 257 and
+ * snapshot "snap" is root 258, then there's no root reference item (key
+ * BTRFS_ROOT_REF_KEY in the root tree) for the subvolume "child"
+ * associated to root 258 (the snapshot) - there's only for the root
+ * of the "parent" subvolume (root 256). In the chunk root we have a
+ * (256 BTRFS_ROOT_REF_KEY 257) key but we don't have a
+ * (258 BTRFS_ROOT_REF_KEY 257) key - the sames goes for backrefs, we
+ * have a (257 BTRFS_ROOT_BACKREF_KEY 256) but we don't have a
+ * (257 BTRFS_ROOT_BACKREF_KEY 258) key.
+ *
+ * So when opening the "child" dentry from the snapshot's directory,
+ * we don't find a root ref item and we create a stub inode. This is
+ * done at new_simple_dir(), called from btrfs_lookup_dentry().
+ */
+ BTRFS_INODE_ROOT_STUB,
};
/* in memory btrfs inode */
@@ -74,10 +128,23 @@ struct btrfs_inode {
/* which subvolume this inode belongs to */
struct btrfs_root *root;
- /* key used to find this inode on disk. This is used by the code
- * to read in roots of subvolumes
+#if BITS_PER_LONG == 32
+ /*
+ * The objectid of the corresponding BTRFS_INODE_ITEM_KEY.
+ * On 64 bits platforms we can get it from vfs_inode.i_ino, which is an
+ * unsigned long and therefore 64 bits on such platforms.
+ */
+ u64 objectid;
+#endif
+
+ /* Cached value of inode property 'compression'. */
+ u8 prop_compress;
+
+ /*
+ * Force compression on the file using the defrag ioctl, could be
+ * different from prop_compress and takes precedence if set.
*/
- struct btrfs_key location;
+ u8 defrag_compress;
/*
* Lock for counters and all fields used to determine if the inode is in
@@ -97,9 +164,11 @@ struct btrfs_inode {
/*
* Keep track of where the inode has extent items mapped in order to
- * make sure the i_size adjustments are accurate
+ * make sure the i_size adjustments are accurate. Not required when the
+ * filesystem is NO_HOLES, the status can't be set while mounted as
+ * it's a mkfs-time feature.
*/
- struct extent_io_tree file_extent_tree;
+ struct extent_io_tree *file_extent_tree;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
@@ -113,7 +182,9 @@ struct btrfs_inode {
unsigned outstanding_extents;
/* used to order data wrt metadata */
- struct btrfs_ordered_inode_tree ordered_tree;
+ spinlock_t ordered_tree_lock;
+ struct rb_root ordered_tree;
+ struct rb_node *ordered_tree_last;
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
@@ -121,9 +192,6 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
- /* node for the red-black tree that links inodes in subvolume root */
- struct rb_node rb_node;
-
unsigned long runtime_flags;
/* full 64 bit generation number, struct vfs_inode doesn't have a big
@@ -184,11 +252,20 @@ struct btrfs_inode {
u64 last_dir_index_offset;
};
- /*
- * Total number of bytes pending defrag, used by stat to check whether
- * it needs COW. Protected by 'lock'.
- */
- u64 defrag_bytes;
+ union {
+ /*
+ * Total number of bytes pending defrag, used by stat to check whether
+ * it needs COW. Protected by 'lock'.
+ * Used by inodes other than the data relocation inode.
+ */
+ u64 defrag_bytes;
+
+ /*
+ * Logical address of the block group being relocated.
+ * Used only by the data relocation inode.
+ */
+ u64 reloc_block_group_start;
+ };
/*
* The size of the file stored in the metadata on disk. data=ordered
@@ -197,12 +274,21 @@ struct btrfs_inode {
*/
u64 disk_i_size;
- /*
- * If this is a directory then index_cnt is the counter for the index
- * number for new files that are created. For an empty directory, this
- * must be initialized to BTRFS_DIR_START_INDEX.
- */
- u64 index_cnt;
+ union {
+ /*
+ * If this is a directory then index_cnt is the counter for the
+ * index number for new files that are created. For an empty
+ * directory, this must be initialized to BTRFS_DIR_START_INDEX.
+ */
+ u64 index_cnt;
+
+ /*
+ * If this is not a directory, this is the number of bytes
+ * outstanding that are going to need csums. This is used in
+ * ENOSPC accounting. Protected by 'lock'.
+ */
+ u64 csum_bytes;
+ };
/* Cache the directory index number to speed the dir/file remove */
u64 dir_index;
@@ -214,22 +300,25 @@ struct btrfs_inode {
*/
u64 last_unlink_trans;
- /*
- * The id/generation of the last transaction where this inode was
- * either the source or the destination of a clone/dedupe operation.
- * Used when logging an inode to know if there are shared extents that
- * need special care when logging checksum items, to avoid duplicate
- * checksum items in a log (which can lead to a corruption where we end
- * up with missing checksum ranges after log replay).
- * Protected by the vfs inode lock.
- */
- u64 last_reflink_trans;
+ union {
+ /*
+ * The id/generation of the last transaction where this inode
+ * was either the source or the destination of a clone/dedupe
+ * operation. Used when logging an inode to know if there are
+ * shared extents that need special care when logging checksum
+ * items, to avoid duplicate checksum items in a log (which can
+ * lead to a corruption where we end up with missing checksum
+ * ranges after log replay). Protected by the VFS inode lock.
+ * Used for regular files only.
+ */
+ u64 last_reflink_trans;
- /*
- * Number of bytes outstanding that are going to need csums. This is
- * used in ENOSPC accounting. Protected by 'lock'.
- */
- u64 csum_bytes;
+ /*
+ * In case this a root stub inode (BTRFS_INODE_ROOT_STUB flag set),
+ * the ID of that root.
+ */
+ u64 ref_root_id;
+ };
/* Backwards incompatible flags, lower half of inode_item::flags */
u32 flags;
@@ -238,20 +327,11 @@ struct btrfs_inode {
struct btrfs_block_rsv block_rsv;
- /*
- * Cached values of inode properties
- */
- unsigned prop_compress; /* per-file compression algorithm */
- /*
- * Force compression on the file using the defrag ioctl, could be
- * different from prop_compress and takes precedence if set
- */
- unsigned defrag_compress;
-
struct btrfs_delayed_node *delayed_node;
/* File creation time. */
- struct timespec64 i_otime;
+ u64 i_otime_sec;
+ u32 i_otime_nsec;
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
@@ -271,10 +351,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode,
WRITE_ONCE(inode->first_dir_index_to_log, index);
}
-static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
-{
- return container_of(inode, struct btrfs_inode, vfs_inode);
-}
+/* Type checked and const-preserving VFS inode -> btrfs inode. */
+#define BTRFS_I(_inode) \
+ _Generic(_inode, \
+ struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \
+ const struct inode *: (const struct btrfs_inode *)container_of( \
+ _inode, const struct btrfs_inode, vfs_inode))
static inline unsigned long btrfs_inode_hash(u64 objectid,
const struct btrfs_root *root)
@@ -296,10 +378,9 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
*/
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
- u64 ino = inode->location.objectid;
+ u64 ino = inode->objectid;
- /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
- if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ if (test_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags))
ino = inode->vfs_inode.i_ino;
return ino;
}
@@ -313,20 +394,36 @@ static inline u64 btrfs_ino(const struct btrfs_inode *inode)
#endif
+static inline void btrfs_get_inode_key(const struct btrfs_inode *inode,
+ struct btrfs_key *key)
+{
+ key->objectid = btrfs_ino(inode);
+ key->type = BTRFS_INODE_ITEM_KEY;
+ key->offset = 0;
+}
+
+static inline void btrfs_set_inode_number(struct btrfs_inode *inode, u64 ino)
+{
+#if BITS_PER_LONG == 32
+ inode->objectid = ino;
+#endif
+ inode->vfs_inode.i_ino = ino;
+}
+
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
i_size_write(&inode->vfs_inode, size);
inode->disk_i_size = size;
}
-static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
+static inline bool btrfs_is_free_space_inode(const struct btrfs_inode *inode)
{
return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
-static inline bool is_data_inode(struct inode *inode)
+static inline bool is_data_inode(const struct btrfs_inode *inode)
{
- return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
+ return btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID;
}
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
@@ -356,9 +453,11 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
}
/*
- * Should be called while holding the inode's VFS lock in exclusive mode or in a
- * context where no one else can access the inode concurrently (during inode
- * creation or when loading an inode from disk).
+ * Should be called while holding the inode's VFS lock in exclusive mode, or
+ * while holding the inode's mmap lock (struct btrfs_inode::i_mmap_lock) in
+ * either shared or exclusive mode, or in a context where no one else can access
+ * the inode concurrently (during inode creation or when loading an inode from
+ * disk).
*/
static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
{
@@ -392,7 +491,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
- inode->last_sub_trans <= inode->root->last_log_commit)
+ inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root))
ret = true;
spin_unlock(&inode->lock);
return ret;
@@ -409,6 +508,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
return true;
}
+static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode)
+{
+ /* Immediately trigger a crash if the inode is not locked. */
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+ /* Trigger a splat in dmesg if this task is not holding the lock. */
+ lockdep_assert_held(&inode->vfs_inode.i_rwsem);
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
@@ -418,10 +525,10 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool nowait, bool strict);
+ struct btrfs_file_extent *file_extent,
+ bool nowait, bool strict);
-void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode);
+void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -471,7 +578,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
@@ -479,16 +585,15 @@ void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path);
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
+struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path);
+struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 end);
+ struct folio *folio, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
+ struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
+ struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
@@ -502,24 +607,27 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page);
+int btrfs_writepage_cow_fixup(struct folio *folio);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size,
+ u64 disk_bytenr, u64 disk_io_size,
struct page **pages);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded);
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size);
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before);
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before);
+struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino);
extern const struct dentry_operations btrfs_dentry_operations;
@@ -535,5 +643,10 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
const u64 del_bytes);
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ u64 num_bytes);
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+ const struct btrfs_file_extent *file_extent,
+ int type);
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
deleted file mode 100644
index 3caf339c4bb3..000000000000
--- a/fs/btrfs/check-integrity.c
+++ /dev/null
@@ -1,2871 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) STRATO AG 2011. All rights reserved.
- */
-
-/*
- * This module can be used to catch cases when the btrfs kernel
- * code executes write requests to the disk that bring the file
- * system in an inconsistent state. In such a state, a power-loss
- * or kernel panic event would cause that the data on disk is
- * lost or at least damaged.
- *
- * Code is added that examines all block write requests during
- * runtime (including writes of the super block). Three rules
- * are verified and an error is printed on violation of the
- * rules:
- * 1. It is not allowed to write a disk block which is
- * currently referenced by the super block (either directly
- * or indirectly).
- * 2. When a super block is written, it is verified that all
- * referenced (directly or indirectly) blocks fulfill the
- * following requirements:
- * 2a. All referenced blocks have either been present when
- * the file system was mounted, (i.e., they have been
- * referenced by the super block) or they have been
- * written since then and the write completion callback
- * was called and no write error was indicated and a
- * FLUSH request to the device where these blocks are
- * located was received and completed.
- * 2b. All referenced blocks need to have a generation
- * number which is equal to the parent's number.
- *
- * One issue that was found using this module was that the log
- * tree on disk became temporarily corrupted because disk blocks
- * that had been in use for the log tree had been freed and
- * reused too early, while being referenced by the written super
- * block.
- *
- * The search term in the kernel log that can be used to filter
- * on the existence of detected integrity issues is
- * "btrfs: attempt".
- *
- * The integrity check is enabled via mount options. These
- * mount options are only supported if the integrity check
- * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
- *
- * Example #1, apply integrity checks to all metadata:
- * mount /dev/sdb1 /mnt -o check_int
- *
- * Example #2, apply integrity checks to all metadata and
- * to data extents:
- * mount /dev/sdb1 /mnt -o check_int_data
- *
- * Example #3, apply integrity checks to all metadata and dump
- * the tree that the super block references to kernel messages
- * each time after a super block was written:
- * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
- *
- * If the integrity check tool is included and activated in
- * the mount options, plenty of kernel memory is used, and
- * plenty of additional CPU cycles are spent. Enabling this
- * functionality is not intended for normal use. In most
- * cases, unless you are a btrfs developer who needs to verify
- * the integrity of (super)-block write requests, do not
- * enable the config option BTRFS_FS_CHECK_INTEGRITY to
- * include and compile the integrity check tool.
- *
- * Expect millions of lines of information in the kernel log with an
- * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
- * kernel config to at least 26 (which is 64MB). Usually the value is
- * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
- * changed like this before LOG_BUF_SHIFT can be set to a high value:
- * config LOG_BUF_SHIFT
- * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
- * range 12 30
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/blkdev.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <crypto/hash.h>
-#include "messages.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "extent_io.h"
-#include "volumes.h"
-#include "print-tree.h"
-#include "locking.h"
-#include "check-integrity.h"
-#include "rcu-string.h"
-#include "compression.h"
-#include "accessors.h"
-
-#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
-#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
-#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
-#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
-#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
-#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
- * excluding " [...]" */
-#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
-
-/*
- * The definition of the bitmask fields for the print_mask.
- * They are specified with the mount option check_integrity_print_mask.
- */
-#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
-#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
-#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
-#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
-#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
-#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
-#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
-#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
-#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
-#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
-#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
-#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
-
-struct btrfsic_dev_state;
-struct btrfsic_state;
-
-struct btrfsic_block {
- u32 magic_num; /* only used for debug purposes */
- unsigned int is_metadata:1; /* if it is meta-data, not data-data */
- unsigned int is_superblock:1; /* if it is one of the superblocks */
- unsigned int is_iodone:1; /* if is done by lower subsystem */
- unsigned int iodone_w_error:1; /* error was indicated to endio */
- unsigned int never_written:1; /* block was added because it was
- * referenced, not because it was
- * written */
- unsigned int mirror_num; /* large enough to hold
- * BTRFS_SUPER_MIRROR_MAX */
- struct btrfsic_dev_state *dev_state;
- u64 dev_bytenr; /* key, physical byte num on disk */
- u64 logical_bytenr; /* logical byte num on disk */
- u64 generation;
- struct btrfs_disk_key disk_key; /* extra info to print in case of
- * issues, will not always be correct */
- struct list_head collision_resolving_node; /* list node */
- struct list_head all_blocks_node; /* list node */
-
- /* the following two lists contain block_link items */
- struct list_head ref_to_list; /* list */
- struct list_head ref_from_list; /* list */
- struct btrfsic_block *next_in_same_bio;
- void *orig_bio_private;
- bio_end_io_t *orig_bio_end_io;
- blk_opf_t submit_bio_bh_rw;
- u64 flush_gen; /* only valid if !never_written */
-};
-
-/*
- * Elements of this type are allocated dynamically and required because
- * each block object can refer to and can be ref from multiple blocks.
- * The key to lookup them in the hashtable is the dev_bytenr of
- * the block ref to plus the one from the block referred from.
- * The fact that they are searchable via a hashtable and that a
- * ref_cnt is maintained is not required for the btrfs integrity
- * check algorithm itself, it is only used to make the output more
- * beautiful in case that an error is detected (an error is defined
- * as a write operation to a block while that block is still referenced).
- */
-struct btrfsic_block_link {
- u32 magic_num; /* only used for debug purposes */
- u32 ref_cnt;
- struct list_head node_ref_to; /* list node */
- struct list_head node_ref_from; /* list node */
- struct list_head collision_resolving_node; /* list node */
- struct btrfsic_block *block_ref_to;
- struct btrfsic_block *block_ref_from;
- u64 parent_generation;
-};
-
-struct btrfsic_dev_state {
- u32 magic_num; /* only used for debug purposes */
- struct block_device *bdev;
- struct btrfsic_state *state;
- struct list_head collision_resolving_node; /* list node */
- struct btrfsic_block dummy_block_for_bio_bh_flush;
- u64 last_flush_gen;
-};
-
-struct btrfsic_block_hashtable {
- struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_link_hashtable {
- struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_dev_state_hashtable {
- struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_data_ctx {
- u64 start; /* virtual bytenr */
- u64 dev_bytenr; /* physical bytenr on device */
- u32 len;
- struct btrfsic_dev_state *dev;
- char **datav;
- struct page **pagev;
- void *mem_to_free;
-};
-
-/* This structure is used to implement recursion without occupying
- * any stack space, refer to btrfsic_process_metablock() */
-struct btrfsic_stack_frame {
- u32 magic;
- u32 nr;
- int error;
- int i;
- int limit_nesting;
- int num_copies;
- int mirror_num;
- struct btrfsic_block *block;
- struct btrfsic_block_data_ctx *block_ctx;
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx next_block_ctx;
- struct btrfs_header *hdr;
- struct btrfsic_stack_frame *prev;
-};
-
-/* Some state per mounted filesystem */
-struct btrfsic_state {
- u32 print_mask;
- int include_extent_data;
- struct list_head all_blocks_list;
- struct btrfsic_block_hashtable block_hashtable;
- struct btrfsic_block_link_hashtable block_link_hashtable;
- struct btrfs_fs_info *fs_info;
- u64 max_superblock_generation;
- struct btrfsic_block *latest_superblock;
- u32 metablock_size;
- u32 datablock_size;
-};
-
-static int btrfsic_process_metablock(struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- int limit_nesting, int force_iodone_flag);
-static void btrfsic_read_from_block_data(
- struct btrfsic_block_data_ctx *block_ctx,
- void *dst, u32 offset, size_t len);
-static int btrfsic_create_link_to_next_block(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx
- *block_ctx, u64 next_bytenr,
- int limit_nesting,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block **next_blockp,
- int force_iodone_flag,
- int *num_copiesp, int *mirror_nump,
- struct btrfs_disk_key *disk_key,
- u64 parent_generation);
-static int btrfsic_handle_extent_data(struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u32 item_offset, int force_iodone_flag);
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
- struct btrfsic_block_data_ctx *block_ctx_out,
- int mirror_num);
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_read_block(struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_process_written_superblock(
- struct btrfsic_state *state,
- struct btrfsic_block *const block,
- struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp);
-static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int recursion_level);
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
- struct btrfsic_block *const block,
- int recursion_level);
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l);
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l);
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
- const struct btrfsic_block *block);
-static void btrfsic_dump_tree(const struct btrfsic_state *state);
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int indent_level);
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block *next_block,
- struct btrfsic_block *from_block,
- u64 parent_generation);
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx,
- const char *additional_string,
- int is_metadata,
- int is_iodone,
- int never_written,
- int mirror_num,
- int *was_created);
-static int btrfsic_process_superblock_dev_mirror(
- struct btrfsic_state *state,
- struct btrfsic_dev_state *dev_state,
- struct btrfs_device *device,
- int superblock_mirror_num,
- struct btrfsic_dev_state **selected_dev_state,
- struct btrfs_super_block *selected_super);
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev);
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
- u64 bytenr,
- struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr);
-
-static struct mutex btrfsic_mutex;
-static int btrfsic_is_initialized;
-static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
-
-
-static void btrfsic_block_init(struct btrfsic_block *b)
-{
- b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
- b->dev_state = NULL;
- b->dev_bytenr = 0;
- b->logical_bytenr = 0;
- b->generation = BTRFSIC_GENERATION_UNKNOWN;
- b->disk_key.objectid = 0;
- b->disk_key.type = 0;
- b->disk_key.offset = 0;
- b->is_metadata = 0;
- b->is_superblock = 0;
- b->is_iodone = 0;
- b->iodone_w_error = 0;
- b->never_written = 0;
- b->mirror_num = 0;
- b->next_in_same_bio = NULL;
- b->orig_bio_private = NULL;
- b->orig_bio_end_io = NULL;
- INIT_LIST_HEAD(&b->collision_resolving_node);
- INIT_LIST_HEAD(&b->all_blocks_node);
- INIT_LIST_HEAD(&b->ref_to_list);
- INIT_LIST_HEAD(&b->ref_from_list);
- b->submit_bio_bh_rw = 0;
- b->flush_gen = 0;
-}
-
-static struct btrfsic_block *btrfsic_block_alloc(void)
-{
- struct btrfsic_block *b;
-
- b = kzalloc(sizeof(*b), GFP_NOFS);
- if (NULL != b)
- btrfsic_block_init(b);
-
- return b;
-}
-
-static void btrfsic_block_free(struct btrfsic_block *b)
-{
- BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
- kfree(b);
-}
-
-static void btrfsic_block_link_init(struct btrfsic_block_link *l)
-{
- l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
- l->ref_cnt = 1;
- INIT_LIST_HEAD(&l->node_ref_to);
- INIT_LIST_HEAD(&l->node_ref_from);
- INIT_LIST_HEAD(&l->collision_resolving_node);
- l->block_ref_to = NULL;
- l->block_ref_from = NULL;
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
-{
- struct btrfsic_block_link *l;
-
- l = kzalloc(sizeof(*l), GFP_NOFS);
- if (NULL != l)
- btrfsic_block_link_init(l);
-
- return l;
-}
-
-static void btrfsic_block_link_free(struct btrfsic_block_link *l)
-{
- BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
- kfree(l);
-}
-
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
-{
- ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
- ds->bdev = NULL;
- ds->state = NULL;
- INIT_LIST_HEAD(&ds->collision_resolving_node);
- ds->last_flush_gen = 0;
- btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
- ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
- ds->dummy_block_for_bio_bh_flush.dev_state = ds;
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
-{
- struct btrfsic_dev_state *ds;
-
- ds = kzalloc(sizeof(*ds), GFP_NOFS);
- if (NULL != ds)
- btrfsic_dev_state_init(ds);
-
- return ds;
-}
-
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
-{
- BUG_ON(!(NULL == ds ||
- BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
- kfree(ds);
-}
-
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
- struct btrfsic_block_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(b->dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
- (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
-
- list_add(&b->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
-{
- list_del(&b->collision_resolving_node);
-}
-
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
- struct block_device *bdev,
- u64 dev_bytenr,
- struct btrfsic_block_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)bdev))) &
- (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
- struct btrfsic_block *b;
-
- list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
- if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
- return b;
- }
-
- return NULL;
-}
-
-static void btrfsic_block_link_hashtable_init(
- struct btrfsic_block_link_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_link_hashtable_add(
- struct btrfsic_block_link *l,
- struct btrfsic_block_link_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
- ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
- ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
- & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
-
- BUG_ON(NULL == l->block_ref_to);
- BUG_ON(NULL == l->block_ref_from);
- list_add(&l->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
-{
- list_del(&l->collision_resolving_node);
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
- struct block_device *bdev_ref_to,
- u64 dev_bytenr_ref_to,
- struct block_device *bdev_ref_from,
- u64 dev_bytenr_ref_from,
- struct btrfsic_block_link_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
- ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
- ((unsigned int)((uintptr_t)bdev_ref_to)) ^
- ((unsigned int)((uintptr_t)bdev_ref_from))) &
- (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
- struct btrfsic_block_link *l;
-
- list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
- BUG_ON(NULL == l->block_ref_to);
- BUG_ON(NULL == l->block_ref_from);
- if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
- l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
- l->block_ref_from->dev_state->bdev == bdev_ref_from &&
- l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
- return l;
- }
-
- return NULL;
-}
-
-static void btrfsic_dev_state_hashtable_init(
- struct btrfsic_dev_state_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_dev_state_hashtable_add(
- struct btrfsic_dev_state *ds,
- struct btrfsic_dev_state_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
- (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
-
- list_add(&ds->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
-{
- list_del(&ds->collision_resolving_node);
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
- struct btrfsic_dev_state_hashtable *h)
-{
- const unsigned int hashval =
- dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1);
- struct btrfsic_dev_state *ds;
-
- list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
- if (ds->bdev->bd_dev == dev)
- return ds;
- }
-
- return NULL;
-}
-
-static int btrfsic_process_superblock(struct btrfsic_state *state,
- struct btrfs_fs_devices *fs_devices)
-{
- struct btrfs_super_block *selected_super;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
- struct btrfsic_dev_state *selected_dev_state = NULL;
- int ret = 0;
- int pass;
-
- selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
- if (!selected_super)
- return -ENOMEM;
-
- list_for_each_entry(device, dev_head, dev_list) {
- int i;
- struct btrfsic_dev_state *dev_state;
-
- if (!device->bdev || !device->name)
- continue;
-
- dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev);
- BUG_ON(NULL == dev_state);
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- ret = btrfsic_process_superblock_dev_mirror(
- state, dev_state, device, i,
- &selected_dev_state, selected_super);
- if (0 != ret && 0 == i) {
- kfree(selected_super);
- return ret;
- }
- }
- }
-
- if (NULL == state->latest_superblock) {
- pr_info("btrfsic: no superblock found!\n");
- kfree(selected_super);
- return -1;
- }
-
- for (pass = 0; pass < 3; pass++) {
- int num_copies;
- int mirror_num;
- u64 next_bytenr;
-
- switch (pass) {
- case 0:
- next_bytenr = btrfs_super_root(selected_super);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("root@%llu\n", next_bytenr);
- break;
- case 1:
- next_bytenr = btrfs_super_chunk_root(selected_super);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("chunk@%llu\n", next_bytenr);
- break;
- case 2:
- next_bytenr = btrfs_super_log_root(selected_super);
- if (0 == next_bytenr)
- continue;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("log@%llu\n", next_bytenr);
- break;
- }
-
- num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
-
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
-
- ret = btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- &tmp_next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n",
- next_bytenr, mirror_num);
- kfree(selected_super);
- return -1;
- }
-
- next_block = btrfsic_block_hashtable_lookup(
- tmp_next_block_ctx.dev->bdev,
- tmp_next_block_ctx.dev_bytenr,
- &state->block_hashtable);
- BUG_ON(NULL == next_block);
-
- l = btrfsic_block_link_hashtable_lookup(
- tmp_next_block_ctx.dev->bdev,
- tmp_next_block_ctx.dev_bytenr,
- state->latest_superblock->dev_state->
- bdev,
- state->latest_superblock->dev_bytenr,
- &state->block_link_hashtable);
- BUG_ON(NULL == l);
-
- ret = btrfsic_read_block(state, &tmp_next_block_ctx);
- if (ret < (int)PAGE_SIZE) {
- pr_info("btrfsic: read @logical %llu failed!\n",
- tmp_next_block_ctx.start);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- kfree(selected_super);
- return -1;
- }
-
- ret = btrfsic_process_metablock(state,
- next_block,
- &tmp_next_block_ctx,
- BTRFS_MAX_LEVEL + 3, 1);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- }
- }
-
- kfree(selected_super);
- return ret;
-}
-
-static int btrfsic_process_superblock_dev_mirror(
- struct btrfsic_state *state,
- struct btrfsic_dev_state *dev_state,
- struct btrfs_device *device,
- int superblock_mirror_num,
- struct btrfsic_dev_state **selected_dev_state,
- struct btrfs_super_block *selected_super)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfs_super_block *super_tmp;
- u64 dev_bytenr;
- struct btrfsic_block *superblock_tmp;
- int pass;
- struct block_device *const superblock_bdev = device->bdev;
- struct page *page;
- struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
- int ret = 0;
-
- /* super block bytenr is always the unmapped device bytenr */
- dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
- return -1;
-
- page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
- if (IS_ERR(page))
- return -1;
-
- super_tmp = page_address(page);
-
- if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
- btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
- memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
- btrfs_super_nodesize(super_tmp) != state->metablock_size ||
- btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
- ret = 0;
- goto out;
- }
-
- superblock_tmp =
- btrfsic_block_hashtable_lookup(superblock_bdev,
- dev_bytenr,
- &state->block_hashtable);
- if (NULL == superblock_tmp) {
- superblock_tmp = btrfsic_block_alloc();
- if (NULL == superblock_tmp) {
- ret = -1;
- goto out;
- }
- /* for superblock, only the dev_bytenr makes sense */
- superblock_tmp->dev_bytenr = dev_bytenr;
- superblock_tmp->dev_state = dev_state;
- superblock_tmp->logical_bytenr = dev_bytenr;
- superblock_tmp->generation = btrfs_super_generation(super_tmp);
- superblock_tmp->is_metadata = 1;
- superblock_tmp->is_superblock = 1;
- superblock_tmp->is_iodone = 1;
- superblock_tmp->never_written = 0;
- superblock_tmp->mirror_num = 1 + superblock_mirror_num;
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
- superblock_bdev,
- btrfs_dev_name(device), dev_bytenr,
- dev_state->bdev, dev_bytenr,
- superblock_mirror_num);
- list_add(&superblock_tmp->all_blocks_node,
- &state->all_blocks_list);
- btrfsic_block_hashtable_add(superblock_tmp,
- &state->block_hashtable);
- }
-
- /* select the one with the highest generation field */
- if (btrfs_super_generation(super_tmp) >
- state->max_superblock_generation ||
- 0 == state->max_superblock_generation) {
- memcpy(selected_super, super_tmp, sizeof(*selected_super));
- *selected_dev_state = dev_state;
- state->max_superblock_generation =
- btrfs_super_generation(super_tmp);
- state->latest_superblock = superblock_tmp;
- }
-
- for (pass = 0; pass < 3; pass++) {
- u64 next_bytenr;
- int num_copies;
- int mirror_num;
- const char *additional_string = NULL;
- struct btrfs_disk_key tmp_disk_key;
-
- tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
- tmp_disk_key.offset = 0;
- switch (pass) {
- case 0:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_TREE_OBJECTID);
- additional_string = "initial root ";
- next_bytenr = btrfs_super_root(super_tmp);
- break;
- case 1:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_CHUNK_TREE_OBJECTID);
- additional_string = "initial chunk ";
- next_bytenr = btrfs_super_chunk_root(super_tmp);
- break;
- case 2:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_TREE_LOG_OBJECTID);
- additional_string = "initial log ";
- next_bytenr = btrfs_super_log_root(super_tmp);
- if (0 == next_bytenr)
- continue;
- break;
- }
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
-
- if (btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- &tmp_next_block_ctx,
- mirror_num)) {
- pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
- next_bytenr, mirror_num);
- ret = -1;
- goto out;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state, &tmp_next_block_ctx,
- additional_string, 1, 1, 0,
- mirror_num, NULL);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- ret = -1;
- goto out;
- }
-
- next_block->disk_key = tmp_disk_key;
- next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
- l = btrfsic_block_link_lookup_or_add(
- state, &tmp_next_block_ctx,
- next_block, superblock_tmp,
- BTRFSIC_GENERATION_UNKNOWN);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- if (NULL == l) {
- ret = -1;
- goto out;
- }
- }
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
- btrfsic_dump_tree_sub(state, superblock_tmp, 0);
-
-out:
- put_page(page);
- return ret;
-}
-
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
-{
- struct btrfsic_stack_frame *sf;
-
- sf = kzalloc(sizeof(*sf), GFP_NOFS);
- if (sf)
- sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
- return sf;
-}
-
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
-{
- BUG_ON(!(NULL == sf ||
- BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
- kfree(sf);
-}
-
-static noinline_for_stack int btrfsic_process_metablock(
- struct btrfsic_state *state,
- struct btrfsic_block *const first_block,
- struct btrfsic_block_data_ctx *const first_block_ctx,
- int first_limit_nesting, int force_iodone_flag)
-{
- struct btrfsic_stack_frame initial_stack_frame = { 0 };
- struct btrfsic_stack_frame *sf;
- struct btrfsic_stack_frame *next_stack;
- struct btrfs_header *const first_hdr =
- (struct btrfs_header *)first_block_ctx->datav[0];
-
- BUG_ON(!first_hdr);
- sf = &initial_stack_frame;
- sf->error = 0;
- sf->i = -1;
- sf->limit_nesting = first_limit_nesting;
- sf->block = first_block;
- sf->block_ctx = first_block_ctx;
- sf->next_block = NULL;
- sf->hdr = first_hdr;
- sf->prev = NULL;
-
-continue_with_new_stack_frame:
- sf->block->generation = btrfs_stack_header_generation(sf->hdr);
- if (0 == sf->hdr->level) {
- struct btrfs_leaf *const leafhdr =
- (struct btrfs_leaf *)sf->hdr;
-
- if (-1 == sf->i) {
- sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("leaf %llu items %d generation %llu owner %llu\n",
- sf->block_ctx->start, sf->nr,
- btrfs_stack_header_generation(
- &leafhdr->header),
- btrfs_stack_header_owner(
- &leafhdr->header));
- }
-
-continue_with_current_leaf_stack_frame:
- if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
- sf->i++;
- sf->num_copies = 0;
- }
-
- if (sf->i < sf->nr) {
- struct btrfs_item disk_item;
- u32 disk_item_offset =
- (uintptr_t)(leafhdr->items + sf->i) -
- (uintptr_t)leafhdr;
- struct btrfs_disk_key *disk_key;
- u8 type;
- u32 item_offset;
- u32 item_size;
-
- if (disk_item_offset + sizeof(struct btrfs_item) >
- sf->block_ctx->len) {
-leaf_item_out_of_bounce_error:
- pr_info(
- "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
- sf->block_ctx->start,
- sf->block_ctx->dev->bdev);
- goto one_stack_frame_backwards;
- }
- btrfsic_read_from_block_data(sf->block_ctx,
- &disk_item,
- disk_item_offset,
- sizeof(struct btrfs_item));
- item_offset = btrfs_stack_item_offset(&disk_item);
- item_size = btrfs_stack_item_size(&disk_item);
- disk_key = &disk_item.key;
- type = btrfs_disk_key_type(disk_key);
-
- if (BTRFS_ROOT_ITEM_KEY == type) {
- struct btrfs_root_item root_item;
- u32 root_item_offset;
- u64 next_bytenr;
-
- root_item_offset = item_offset +
- offsetof(struct btrfs_leaf, items);
- if (root_item_offset + item_size >
- sf->block_ctx->len)
- goto leaf_item_out_of_bounce_error;
- btrfsic_read_from_block_data(
- sf->block_ctx, &root_item,
- root_item_offset,
- item_size);
- next_bytenr = btrfs_root_bytenr(&root_item);
-
- sf->error =
- btrfsic_create_link_to_next_block(
- state,
- sf->block,
- sf->block_ctx,
- next_bytenr,
- sf->limit_nesting,
- &sf->next_block_ctx,
- &sf->next_block,
- force_iodone_flag,
- &sf->num_copies,
- &sf->mirror_num,
- disk_key,
- btrfs_root_generation(
- &root_item));
- if (sf->error)
- goto one_stack_frame_backwards;
-
- if (NULL != sf->next_block) {
- struct btrfs_header *const next_hdr =
- (struct btrfs_header *)
- sf->next_block_ctx.datav[0];
-
- next_stack =
- btrfsic_stack_frame_alloc();
- if (NULL == next_stack) {
- sf->error = -1;
- btrfsic_release_block_ctx(
- &sf->
- next_block_ctx);
- goto one_stack_frame_backwards;
- }
-
- next_stack->i = -1;
- next_stack->block = sf->next_block;
- next_stack->block_ctx =
- &sf->next_block_ctx;
- next_stack->next_block = NULL;
- next_stack->hdr = next_hdr;
- next_stack->limit_nesting =
- sf->limit_nesting - 1;
- next_stack->prev = sf;
- sf = next_stack;
- goto continue_with_new_stack_frame;
- }
- } else if (BTRFS_EXTENT_DATA_KEY == type &&
- state->include_extent_data) {
- sf->error = btrfsic_handle_extent_data(
- state,
- sf->block,
- sf->block_ctx,
- item_offset,
- force_iodone_flag);
- if (sf->error)
- goto one_stack_frame_backwards;
- }
-
- goto continue_with_current_leaf_stack_frame;
- }
- } else {
- struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
-
- if (-1 == sf->i) {
- sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("node %llu level %d items %d generation %llu owner %llu\n",
- sf->block_ctx->start,
- nodehdr->header.level, sf->nr,
- btrfs_stack_header_generation(
- &nodehdr->header),
- btrfs_stack_header_owner(
- &nodehdr->header));
- }
-
-continue_with_current_node_stack_frame:
- if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
- sf->i++;
- sf->num_copies = 0;
- }
-
- if (sf->i < sf->nr) {
- struct btrfs_key_ptr key_ptr;
- u32 key_ptr_offset;
- u64 next_bytenr;
-
- key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
- (uintptr_t)nodehdr;
- if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
- sf->block_ctx->len) {
- pr_info(
- "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
- sf->block_ctx->start,
- sf->block_ctx->dev->bdev);
- goto one_stack_frame_backwards;
- }
- btrfsic_read_from_block_data(
- sf->block_ctx, &key_ptr, key_ptr_offset,
- sizeof(struct btrfs_key_ptr));
- next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
-
- sf->error = btrfsic_create_link_to_next_block(
- state,
- sf->block,
- sf->block_ctx,
- next_bytenr,
- sf->limit_nesting,
- &sf->next_block_ctx,
- &sf->next_block,
- force_iodone_flag,
- &sf->num_copies,
- &sf->mirror_num,
- &key_ptr.key,
- btrfs_stack_key_generation(&key_ptr));
- if (sf->error)
- goto one_stack_frame_backwards;
-
- if (NULL != sf->next_block) {
- struct btrfs_header *const next_hdr =
- (struct btrfs_header *)
- sf->next_block_ctx.datav[0];
-
- next_stack = btrfsic_stack_frame_alloc();
- if (NULL == next_stack) {
- sf->error = -1;
- goto one_stack_frame_backwards;
- }
-
- next_stack->i = -1;
- next_stack->block = sf->next_block;
- next_stack->block_ctx = &sf->next_block_ctx;
- next_stack->next_block = NULL;
- next_stack->hdr = next_hdr;
- next_stack->limit_nesting =
- sf->limit_nesting - 1;
- next_stack->prev = sf;
- sf = next_stack;
- goto continue_with_new_stack_frame;
- }
-
- goto continue_with_current_node_stack_frame;
- }
- }
-
-one_stack_frame_backwards:
- if (NULL != sf->prev) {
- struct btrfsic_stack_frame *const prev = sf->prev;
-
- /* the one for the initial block is freed in the caller */
- btrfsic_release_block_ctx(sf->block_ctx);
-
- if (sf->error) {
- prev->error = sf->error;
- btrfsic_stack_frame_free(sf);
- sf = prev;
- goto one_stack_frame_backwards;
- }
-
- btrfsic_stack_frame_free(sf);
- sf = prev;
- goto continue_with_new_stack_frame;
- } else {
- BUG_ON(&initial_stack_frame != sf);
- }
-
- return sf->error;
-}
-
-static void btrfsic_read_from_block_data(
- struct btrfsic_block_data_ctx *block_ctx,
- void *dstv, u32 offset, size_t len)
-{
- size_t cur;
- size_t pgoff;
- char *kaddr;
- char *dst = (char *)dstv;
- size_t start_offset = offset_in_page(block_ctx->start);
- unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
-
- WARN_ON(offset + len > block_ctx->len);
- pgoff = offset_in_page(start_offset + offset);
-
- while (len > 0) {
- cur = min(len, ((size_t)PAGE_SIZE - pgoff));
- BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
- kaddr = block_ctx->datav[i];
- memcpy(dst, kaddr + pgoff, cur);
-
- dst += cur;
- len -= cur;
- pgoff = 0;
- i++;
- }
-}
-
-static int btrfsic_create_link_to_next_block(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u64 next_bytenr,
- int limit_nesting,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block **next_blockp,
- int force_iodone_flag,
- int *num_copiesp, int *mirror_nump,
- struct btrfs_disk_key *disk_key,
- u64 parent_generation)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfsic_block *next_block = NULL;
- int ret;
- struct btrfsic_block_link *l;
- int did_alloc_block_link;
- int block_was_created;
-
- *next_blockp = NULL;
- if (0 == *num_copiesp) {
- *num_copiesp = btrfs_num_copies(fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, *num_copiesp);
- *mirror_nump = 1;
- }
-
- if (*mirror_nump > *num_copiesp)
- return 0;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n",
- *mirror_nump);
- ret = btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- next_block_ctx, *mirror_nump);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, *mirror_nump);
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(state,
- next_block_ctx, "referenced ",
- 1, force_iodone_flag,
- !force_iodone_flag,
- *mirror_nump,
- &block_was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
- if (block_was_created) {
- l = NULL;
- next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
- } else {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
- if (next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr))
- pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
- next_bytenr, next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state,
- next_block),
- next_block->logical_bytenr);
- else
- pr_info(
- "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- next_bytenr, next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state,
- next_block));
- }
- next_block->logical_bytenr = next_bytenr;
-
- next_block->mirror_num = *mirror_nump;
- l = btrfsic_block_link_hashtable_lookup(
- next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr,
- block_ctx->dev->bdev,
- block_ctx->dev_bytenr,
- &state->block_link_hashtable);
- }
-
- next_block->disk_key = *disk_key;
- if (NULL == l) {
- l = btrfsic_block_link_alloc();
- if (NULL == l) {
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- did_alloc_block_link = 1;
- l->block_ref_to = next_block;
- l->block_ref_from = block;
- l->ref_cnt = 1;
- l->parent_generation = parent_generation;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
-
- list_add(&l->node_ref_to, &block->ref_to_list);
- list_add(&l->node_ref_from, &next_block->ref_from_list);
-
- btrfsic_block_link_hashtable_add(l,
- &state->block_link_hashtable);
- } else {
- did_alloc_block_link = 0;
- if (0 == limit_nesting) {
- l->ref_cnt++;
- l->parent_generation = parent_generation;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
- }
- }
-
- if (limit_nesting > 0 && did_alloc_block_link) {
- ret = btrfsic_read_block(state, next_block_ctx);
- if (ret < (int)next_block_ctx->len) {
- pr_info("btrfsic: read block @logical %llu failed!\n",
- next_bytenr);
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- *next_blockp = next_block;
- } else {
- *next_blockp = NULL;
- }
- (*mirror_nump)++;
-
- return 0;
-}
-
-static int btrfsic_handle_extent_data(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u32 item_offset, int force_iodone_flag)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfs_file_extent_item file_extent_item;
- u64 file_extent_item_offset;
- u64 next_bytenr;
- u64 num_bytes;
- u64 generation;
- struct btrfsic_block_link *l;
- int ret;
-
- file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
- item_offset;
- if (file_extent_item_offset +
- offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
- block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
- block_ctx->start, block_ctx->dev->bdev);
- return -1;
- }
-
- btrfsic_read_from_block_data(block_ctx, &file_extent_item,
- file_extent_item_offset,
- offsetof(struct btrfs_file_extent_item, disk_num_bytes));
- if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
- btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("extent_data: type %u, disk_bytenr = %llu\n",
- file_extent_item.type,
- btrfs_stack_file_extent_disk_bytenr(
- &file_extent_item));
- return 0;
- }
-
- if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
- block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
- block_ctx->start, block_ctx->dev->bdev);
- return -1;
- }
- btrfsic_read_from_block_data(block_ctx, &file_extent_item,
- file_extent_item_offset,
- sizeof(struct btrfs_file_extent_item));
- next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
- if (btrfs_stack_file_extent_compression(&file_extent_item) ==
- BTRFS_COMPRESS_NONE) {
- next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
- num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
- } else {
- num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
- }
- generation = btrfs_stack_file_extent_generation(&file_extent_item);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n",
- file_extent_item.type,
- btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
- btrfs_stack_file_extent_offset(&file_extent_item),
- num_bytes);
- while (num_bytes > 0) {
- u32 chunk_len;
- int num_copies;
- int mirror_num;
-
- if (num_bytes > state->datablock_size)
- chunk_len = state->datablock_size;
- else
- chunk_len = num_bytes;
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- state->datablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block_data_ctx next_block_ctx;
- struct btrfsic_block *next_block;
- int block_was_created;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n",
- mirror_num);
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("\tdisk_bytenr = %llu, num_bytes %u\n",
- next_bytenr, chunk_len);
- ret = btrfsic_map_block(state, next_bytenr,
- chunk_len, &next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, mirror_num);
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state,
- &next_block_ctx,
- "referenced ",
- 0,
- force_iodone_flag,
- !force_iodone_flag,
- mirror_num,
- &block_was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&next_block_ctx);
- return -1;
- }
- if (!block_was_created) {
- if ((state->print_mask &
- BTRFSIC_PRINT_MASK_VERBOSE) &&
- next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr)) {
- pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
- next_bytenr,
- next_block_ctx.dev->bdev,
- next_block_ctx.dev_bytenr,
- mirror_num,
- next_block->logical_bytenr);
- }
- next_block->logical_bytenr = next_bytenr;
- next_block->mirror_num = mirror_num;
- }
-
- l = btrfsic_block_link_lookup_or_add(state,
- &next_block_ctx,
- next_block, block,
- generation);
- btrfsic_release_block_ctx(&next_block_ctx);
- if (NULL == l)
- return -1;
- }
-
- next_bytenr += chunk_len;
- num_bytes -= chunk_len;
- }
-
- return 0;
-}
-
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
- struct btrfsic_block_data_ctx *block_ctx_out,
- int mirror_num)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- int ret;
- u64 length;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_io_stripe smap, *map;
- struct btrfs_device *device;
-
- length = len;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc,
- NULL, &mirror_num, 0);
- if (ret) {
- block_ctx_out->start = 0;
- block_ctx_out->dev_bytenr = 0;
- block_ctx_out->len = 0;
- block_ctx_out->dev = NULL;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
-
- return ret;
- }
-
- if (bioc)
- map = &bioc->stripes[0];
- else
- map = &smap;
-
- device = map->dev;
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
- !device->bdev || !device->name)
- block_ctx_out->dev = NULL;
- else
- block_ctx_out->dev = btrfsic_dev_state_lookup(
- device->bdev->bd_dev);
- block_ctx_out->dev_bytenr = map->physical;
- block_ctx_out->start = bytenr;
- block_ctx_out->len = len;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
-
- kfree(bioc);
- if (NULL == block_ctx_out->dev) {
- ret = -ENXIO;
- pr_info("btrfsic: error, cannot lookup dev (#1)!\n");
- }
-
- return ret;
-}
-
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
-{
- if (block_ctx->mem_to_free) {
- unsigned int num_pages;
-
- BUG_ON(!block_ctx->datav);
- BUG_ON(!block_ctx->pagev);
- num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- /* Pages must be unmapped in reverse order */
- while (num_pages > 0) {
- num_pages--;
- if (block_ctx->datav[num_pages])
- block_ctx->datav[num_pages] = NULL;
- if (block_ctx->pagev[num_pages]) {
- __free_page(block_ctx->pagev[num_pages]);
- block_ctx->pagev[num_pages] = NULL;
- }
- }
-
- kfree(block_ctx->mem_to_free);
- block_ctx->mem_to_free = NULL;
- block_ctx->pagev = NULL;
- block_ctx->datav = NULL;
- }
-}
-
-static int btrfsic_read_block(struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx)
-{
- unsigned int num_pages;
- unsigned int i;
- size_t size;
- u64 dev_bytenr;
- int ret;
-
- BUG_ON(block_ctx->datav);
- BUG_ON(block_ctx->pagev);
- BUG_ON(block_ctx->mem_to_free);
- if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
- pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
- block_ctx->dev_bytenr);
- return -1;
- }
-
- num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
- block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
- if (!block_ctx->mem_to_free)
- return -ENOMEM;
- block_ctx->datav = block_ctx->mem_to_free;
- block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
- if (ret)
- return ret;
-
- dev_bytenr = block_ctx->dev_bytenr;
- for (i = 0; i < num_pages;) {
- struct bio *bio;
- unsigned int j;
-
- bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
- REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT;
-
- for (j = i; j < num_pages; j++) {
- ret = bio_add_page(bio, block_ctx->pagev[j],
- PAGE_SIZE, 0);
- if (PAGE_SIZE != ret)
- break;
- }
- if (j == i) {
- pr_info("btrfsic: error, failed to add a single page!\n");
- return -1;
- }
- if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %pg!\n",
- block_ctx->start, block_ctx->dev->bdev);
- bio_put(bio);
- return -1;
- }
- bio_put(bio);
- dev_bytenr += (j - i) * PAGE_SIZE;
- i = j;
- }
- for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
-
- return block_ctx->len;
-}
-
-static void btrfsic_dump_database(struct btrfsic_state *state)
-{
- const struct btrfsic_block *b_all;
-
- BUG_ON(NULL == state);
-
- pr_info("all_blocks_list:\n");
- list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
- const struct btrfsic_block_link *l;
-
- pr_info("%c-block @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num);
-
- list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(
- " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- }
-
- list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(
- " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr,
- l->block_ref_from->mirror_num);
- }
-
- pr_info("\n");
- }
-}
-
-/*
- * Test whether the disk block contains a tree block (leaf or node)
- * (note that this test fails for the super block)
- */
-static noinline_for_stack int btrfsic_test_for_metadata(
- struct btrfsic_state *state,
- char **datav, unsigned int num_pages)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- struct btrfs_header *h;
- u8 csum[BTRFS_CSUM_SIZE];
- unsigned int i;
-
- if (num_pages * PAGE_SIZE < state->metablock_size)
- return 1; /* not metadata */
- num_pages = state->metablock_size >> PAGE_SHIFT;
- h = (struct btrfs_header *)datav[0];
-
- if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
- return 1;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
-
- for (i = 0; i < num_pages; i++) {
- u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
- size_t sublen = i ? PAGE_SIZE :
- (PAGE_SIZE - BTRFS_CSUM_SIZE);
-
- crypto_shash_update(shash, data, sublen);
- }
- crypto_shash_final(shash, csum);
- if (memcmp(csum, h->csum, fs_info->csum_size))
- return 1;
-
- return 0; /* is metadata */
-}
-
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char **mapped_datav,
- unsigned int num_pages,
- struct bio *bio, int *bio_is_patched,
- blk_opf_t submit_bio_bh_rw)
-{
- int is_metadata;
- struct btrfsic_block *block;
- struct btrfsic_block_data_ctx block_ctx;
- int ret;
- struct btrfsic_state *state = dev_state->state;
- struct block_device *bdev = dev_state->bdev;
- unsigned int processed_len;
-
- if (NULL != bio_is_patched)
- *bio_is_patched = 0;
-
-again:
- if (num_pages == 0)
- return;
-
- processed_len = 0;
- is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
- num_pages));
-
- block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
- &state->block_hashtable);
- if (NULL != block) {
- u64 bytenr = 0;
- struct btrfsic_block_link *l, *tmp;
-
- if (block->is_superblock) {
- bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
- mapped_datav[0]);
- if (num_pages * PAGE_SIZE <
- BTRFS_SUPER_INFO_SIZE) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- is_metadata = 1;
- BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
- processed_len = BTRFS_SUPER_INFO_SIZE;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
- pr_info("[before new superblock is written]:\n");
- btrfsic_dump_tree_sub(state, block, 0);
- }
- }
- if (is_metadata) {
- if (!block->is_superblock) {
- if (num_pages * PAGE_SIZE <
- state->metablock_size) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- processed_len = state->metablock_size;
- bytenr = btrfs_stack_header_bytenr(
- (struct btrfs_header *)
- mapped_datav[0]);
- btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
- dev_state,
- dev_bytenr);
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
- if (block->logical_bytenr != bytenr &&
- !(!block->is_metadata &&
- block->logical_bytenr == 0))
- pr_info(
-"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
- bytenr, dev_state->bdev,
- dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state,
- block),
- block->logical_bytenr);
- else
- pr_info(
- "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- bytenr, dev_state->bdev,
- dev_bytenr, block->mirror_num,
- btrfsic_get_block_type(state,
- block));
- }
- block->logical_bytenr = bytenr;
- } else {
- if (num_pages * PAGE_SIZE <
- state->datablock_size) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- processed_len = state->datablock_size;
- bytenr = block->logical_bytenr;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- bytenr, dev_state->bdev, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block));
- }
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("ref_to_list: %cE, ref_from_list: %cE\n",
- list_empty(&block->ref_to_list) ? ' ' : '!',
- list_empty(&block->ref_from_list) ? ' ' : '!');
- if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
- btrfsic_get_block_type(state, block), bytenr,
- dev_state->bdev, dev_bytenr, block->mirror_num,
- block->generation,
- btrfs_disk_key_objectid(&block->disk_key),
- block->disk_key.type,
- btrfs_disk_key_offset(&block->disk_key),
- btrfs_stack_header_generation(
- (struct btrfs_header *) mapped_datav[0]),
- state->max_superblock_generation);
- btrfsic_dump_tree(state);
- }
-
- if (!block->is_iodone && !block->never_written) {
- pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
- btrfsic_get_block_type(state, block), bytenr,
- dev_state->bdev, dev_bytenr, block->mirror_num,
- block->generation,
- btrfs_stack_header_generation(
- (struct btrfs_header *)
- mapped_datav[0]));
- /* it would not be safe to go on */
- btrfsic_dump_tree(state);
- goto continue_loop;
- }
-
- /*
- * Clear all references of this block. Do not free
- * the block itself even if is not referenced anymore
- * because it still carries valuable information
- * like whether it was ever written and IO completed.
- */
- list_for_each_entry_safe(l, tmp, &block->ref_to_list,
- node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_rem_link(state, l);
- l->ref_cnt--;
- if (0 == l->ref_cnt) {
- list_del(&l->node_ref_to);
- list_del(&l->node_ref_from);
- btrfsic_block_link_hashtable_remove(l);
- btrfsic_block_link_free(l);
- }
- }
-
- block_ctx.dev = dev_state;
- block_ctx.dev_bytenr = dev_bytenr;
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.pagev = NULL;
- block_ctx.mem_to_free = NULL;
- block_ctx.datav = mapped_datav;
-
- if (is_metadata || state->include_extent_data) {
- block->never_written = 0;
- block->iodone_w_error = 0;
- if (NULL != bio) {
- block->is_iodone = 0;
- BUG_ON(NULL == bio_is_patched);
- if (!*bio_is_patched) {
- block->orig_bio_private =
- bio->bi_private;
- block->orig_bio_end_io =
- bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- *bio_is_patched = 1;
- } else {
- struct btrfsic_block *chained_block =
- (struct btrfsic_block *)
- bio->bi_private;
-
- BUG_ON(NULL == chained_block);
- block->orig_bio_private =
- chained_block->orig_bio_private;
- block->orig_bio_end_io =
- chained_block->orig_bio_end_io;
- block->next_in_same_bio = chained_block;
- bio->bi_private = block;
- }
- } else {
- block->is_iodone = 1;
- block->orig_bio_private = NULL;
- block->orig_bio_end_io = NULL;
- block->next_in_same_bio = NULL;
- }
- }
-
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = submit_bio_bh_rw;
- if (is_metadata) {
- block->logical_bytenr = bytenr;
- block->is_metadata = 1;
- if (block->is_superblock) {
- BUG_ON(PAGE_SIZE !=
- BTRFS_SUPER_INFO_SIZE);
- ret = btrfsic_process_written_superblock(
- state,
- block,
- (struct btrfs_super_block *)
- mapped_datav[0]);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
- pr_info("[after new superblock is written]:\n");
- btrfsic_dump_tree_sub(state, block, 0);
- }
- } else {
- block->mirror_num = 0; /* unknown */
- ret = btrfsic_process_metablock(
- state,
- block,
- &block_ctx,
- 0, 0);
- }
- if (ret)
- pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n",
- dev_bytenr);
- } else {
- block->is_metadata = 0;
- block->mirror_num = 0; /* unknown */
- block->generation = BTRFSIC_GENERATION_UNKNOWN;
- if (!state->include_extent_data
- && list_empty(&block->ref_from_list)) {
- /*
- * disk block is overwritten with extent
- * data (not meta data) and we are configured
- * to not include extent data: take the
- * chance and free the block's memory
- */
- btrfsic_block_hashtable_remove(block);
- list_del(&block->all_blocks_node);
- btrfsic_block_free(block);
- }
- }
- btrfsic_release_block_ctx(&block_ctx);
- } else {
- /* block has not been found in hash table */
- u64 bytenr;
-
- if (!is_metadata) {
- processed_len = state->datablock_size;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block (%pg/%llu/?) !found in hash table, D\n",
- dev_state->bdev, dev_bytenr);
- if (!state->include_extent_data) {
- /* ignore that written D block */
- goto continue_loop;
- }
-
- /* this is getting ugly for the
- * include_extent_data case... */
- bytenr = 0; /* unknown */
- } else {
- processed_len = state->metablock_size;
- bytenr = btrfs_stack_header_bytenr(
- (struct btrfs_header *)
- mapped_datav[0]);
- btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
- dev_bytenr);
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
- bytenr, dev_state->bdev, dev_bytenr);
- }
-
- block_ctx.dev = dev_state;
- block_ctx.dev_bytenr = dev_bytenr;
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.pagev = NULL;
- block_ctx.mem_to_free = NULL;
- block_ctx.datav = mapped_datav;
-
- block = btrfsic_block_alloc();
- if (NULL == block) {
- btrfsic_release_block_ctx(&block_ctx);
- goto continue_loop;
- }
- block->dev_state = dev_state;
- block->dev_bytenr = dev_bytenr;
- block->logical_bytenr = bytenr;
- block->is_metadata = is_metadata;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->mirror_num = 0; /* unknown */
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = submit_bio_bh_rw;
- if (NULL != bio) {
- block->is_iodone = 0;
- BUG_ON(NULL == bio_is_patched);
- if (!*bio_is_patched) {
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- *bio_is_patched = 1;
- } else {
- struct btrfsic_block *chained_block =
- (struct btrfsic_block *)
- bio->bi_private;
-
- BUG_ON(NULL == chained_block);
- block->orig_bio_private =
- chained_block->orig_bio_private;
- block->orig_bio_end_io =
- chained_block->orig_bio_end_io;
- block->next_in_same_bio = chained_block;
- bio->bi_private = block;
- }
- } else {
- block->is_iodone = 1;
- block->orig_bio_private = NULL;
- block->orig_bio_end_io = NULL;
- block->next_in_same_bio = NULL;
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
- is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- list_add(&block->all_blocks_node, &state->all_blocks_list);
- btrfsic_block_hashtable_add(block, &state->block_hashtable);
-
- if (is_metadata) {
- ret = btrfsic_process_metablock(state, block,
- &block_ctx, 0, 0);
- if (ret)
- pr_info("btrfsic: process_metablock(root @%llu) failed!\n",
- dev_bytenr);
- }
- btrfsic_release_block_ctx(&block_ctx);
- }
-
-continue_loop:
- BUG_ON(!processed_len);
- dev_bytenr += processed_len;
- mapped_datav += processed_len >> PAGE_SHIFT;
- num_pages -= processed_len >> PAGE_SHIFT;
- goto again;
-}
-
-static void btrfsic_bio_end_io(struct bio *bp)
-{
- struct btrfsic_block *block = bp->bi_private;
- int iodone_w_error;
-
- /* mutex is not held! This is not save if IO is not yet completed
- * on umount */
- iodone_w_error = 0;
- if (bp->bi_status)
- iodone_w_error = 1;
-
- BUG_ON(NULL == block);
- bp->bi_private = block->orig_bio_private;
- bp->bi_end_io = block->orig_bio_end_io;
-
- do {
- struct btrfsic_block *next_block;
- struct btrfsic_dev_state *const dev_state = block->dev_state;
-
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
- bp->bi_status,
- btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- next_block = block->next_in_same_bio;
- block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
- dev_state->last_flush_gen++;
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %pg flush_gen=%llu\n",
- dev_state->bdev,
- dev_state->last_flush_gen);
- }
- if (block->submit_bio_bh_rw & REQ_FUA)
- block->flush_gen = 0; /* FUA completed means block is
- * on disk */
- block->is_iodone = 1; /* for FLUSH, this releases the block */
- block = next_block;
- } while (NULL != block);
-
- bp->bi_end_io(bp);
-}
-
-static int btrfsic_process_written_superblock(
- struct btrfsic_state *state,
- struct btrfsic_block *const superblock,
- struct btrfs_super_block *const super_hdr)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- int pass;
-
- superblock->generation = btrfs_super_generation(super_hdr);
- if (!(superblock->generation > state->max_superblock_generation ||
- 0 == state->max_superblock_generation)) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info(
- "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
- superblock->logical_bytenr,
- superblock->dev_state->bdev,
- superblock->dev_bytenr, superblock->mirror_num,
- btrfs_super_generation(super_hdr),
- state->max_superblock_generation);
- } else {
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info(
- "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
- superblock->logical_bytenr,
- superblock->dev_state->bdev,
- superblock->dev_bytenr, superblock->mirror_num,
- btrfs_super_generation(super_hdr),
- state->max_superblock_generation);
-
- state->max_superblock_generation =
- btrfs_super_generation(super_hdr);
- state->latest_superblock = superblock;
- }
-
- for (pass = 0; pass < 3; pass++) {
- int ret;
- u64 next_bytenr;
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
- int num_copies;
- int mirror_num;
- const char *additional_string = NULL;
- struct btrfs_disk_key tmp_disk_key = {0};
-
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_ITEM_KEY);
- btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
-
- switch (pass) {
- case 0:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_TREE_OBJECTID);
- additional_string = "root ";
- next_bytenr = btrfs_super_root(super_hdr);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("root@%llu\n", next_bytenr);
- break;
- case 1:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_CHUNK_TREE_OBJECTID);
- additional_string = "chunk ";
- next_bytenr = btrfs_super_chunk_root(super_hdr);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("chunk@%llu\n", next_bytenr);
- break;
- case 2:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_TREE_LOG_OBJECTID);
- additional_string = "log ";
- next_bytenr = btrfs_super_log_root(super_hdr);
- if (0 == next_bytenr)
- continue;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("log@%llu\n", next_bytenr);
- break;
- }
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- BTRFS_SUPER_INFO_SIZE);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- int was_created;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num);
- ret = btrfsic_map_block(state, next_bytenr,
- BTRFS_SUPER_INFO_SIZE,
- &tmp_next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, mirror_num);
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state,
- &tmp_next_block_ctx,
- additional_string,
- 1, 0, 1,
- mirror_num,
- &was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- return -1;
- }
-
- next_block->disk_key = tmp_disk_key;
- if (was_created)
- next_block->generation =
- BTRFSIC_GENERATION_UNKNOWN;
- l = btrfsic_block_link_lookup_or_add(
- state,
- &tmp_next_block_ctx,
- next_block,
- superblock,
- BTRFSIC_GENERATION_UNKNOWN);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- if (NULL == l)
- return -1;
- }
- }
-
- if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
- btrfsic_dump_tree(state);
-
- return 0;
-}
-
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
- struct btrfsic_block *const block,
- int recursion_level)
-{
- const struct btrfsic_block_link *l;
- int ret = 0;
-
- if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
- /*
- * Note that this situation can happen and does not
- * indicate an error in regular cases. It happens
- * when disk blocks are freed and later reused.
- * The check-integrity module is not aware of any
- * block free operations, it just recognizes block
- * write operations. Therefore it keeps the linkage
- * information for a block until a block is
- * rewritten. This can temporarily cause incorrect
- * and even circular linkage information. This
- * causes no harm unless such blocks are referenced
- * by the most recent super block.
- */
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic: abort cyclic linkage (case 1).\n");
-
- return ret;
- }
-
- /*
- * This algorithm is recursive because the amount of used stack
- * space is very small and the max recursion depth is limited.
- */
- list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
- recursion_level,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- if (l->block_ref_to->never_written) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (!l->block_ref_to->is_iodone) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (l->block_ref_to->iodone_w_error) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (l->parent_generation !=
- l->block_ref_to->generation &&
- BTRFSIC_GENERATION_UNKNOWN !=
- l->parent_generation &&
- BTRFSIC_GENERATION_UNKNOWN !=
- l->block_ref_to->generation) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num,
- l->block_ref_to->generation,
- l->parent_generation);
- ret = -1;
- } else if (l->block_ref_to->flush_gen >
- l->block_ref_to->dev_state->last_flush_gen) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num, block->flush_gen,
- l->block_ref_to->dev_state->last_flush_gen);
- ret = -1;
- } else if (-1 == btrfsic_check_all_ref_blocks(state,
- l->block_ref_to,
- recursion_level +
- 1)) {
- ret = -1;
- }
- }
-
- return ret;
-}
-
-static int btrfsic_is_block_ref_by_superblock(
- const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int recursion_level)
-{
- const struct btrfsic_block_link *l;
-
- if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
- /* refer to comment at "abort cyclic linkage (case 1)" */
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic: abort cyclic linkage (case 2).\n");
-
- return 0;
- }
-
- /*
- * This algorithm is recursive because the amount of used stack space
- * is very small and the max recursion depth is limited.
- */
- list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
- recursion_level,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr,
- l->block_ref_from->mirror_num);
- if (l->block_ref_from->is_superblock &&
- state->latest_superblock->dev_bytenr ==
- l->block_ref_from->dev_bytenr &&
- state->latest_superblock->dev_state->bdev ==
- l->block_ref_from->dev_state->bdev)
- return 1;
- else if (btrfsic_is_block_ref_by_superblock(state,
- l->block_ref_from,
- recursion_level +
- 1))
- return 1;
- }
-
- return 0;
-}
-
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l)
-{
- pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
-}
-
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l)
-{
- pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
-}
-
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
- const struct btrfsic_block *block)
-{
- if (block->is_superblock &&
- state->latest_superblock->dev_bytenr == block->dev_bytenr &&
- state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
- return 'S';
- else if (block->is_superblock)
- return 's';
- else if (block->is_metadata)
- return 'M';
- else
- return 'D';
-}
-
-static void btrfsic_dump_tree(const struct btrfsic_state *state)
-{
- btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
-}
-
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int indent_level)
-{
- const struct btrfsic_block_link *l;
- int indent_add;
- static char buf[80];
- int cursor_position;
-
- /*
- * Should better fill an on-stack buffer with a complete line and
- * dump it at once when it is time to print a newline character.
- */
-
- /*
- * This algorithm is recursive because the amount of used stack space
- * is very small and the max recursion depth is limited.
- */
- indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
- printk("[...]\n");
- return;
- }
- printk(buf);
- indent_level += indent_add;
- if (list_empty(&block->ref_to_list)) {
- printk("\n");
- return;
- }
- if (block->mirror_num > 1 &&
- !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
- printk(" [...]\n");
- return;
- }
-
- cursor_position = indent_level;
- list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
- while (cursor_position < indent_level) {
- printk(" ");
- cursor_position++;
- }
- if (l->ref_cnt > 1)
- indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
- else
- indent_add = sprintf(buf, " --> ");
- if (indent_level + indent_add >
- BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
- printk("[...]\n");
- cursor_position = 0;
- continue;
- }
-
- printk(buf);
-
- btrfsic_dump_tree_sub(state, l->block_ref_to,
- indent_level + indent_add);
- cursor_position = 0;
- }
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block *next_block,
- struct btrfsic_block *from_block,
- u64 parent_generation)
-{
- struct btrfsic_block_link *l;
-
- l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr,
- from_block->dev_state->bdev,
- from_block->dev_bytenr,
- &state->block_link_hashtable);
- if (NULL == l) {
- l = btrfsic_block_link_alloc();
- if (!l)
- return NULL;
-
- l->block_ref_to = next_block;
- l->block_ref_from = from_block;
- l->ref_cnt = 1;
- l->parent_generation = parent_generation;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
-
- list_add(&l->node_ref_to, &from_block->ref_to_list);
- list_add(&l->node_ref_from, &next_block->ref_from_list);
-
- btrfsic_block_link_hashtable_add(l,
- &state->block_link_hashtable);
- } else {
- l->ref_cnt++;
- l->parent_generation = parent_generation;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
- }
-
- return l;
-}
-
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx,
- const char *additional_string,
- int is_metadata,
- int is_iodone,
- int never_written,
- int mirror_num,
- int *was_created)
-{
- struct btrfsic_block *block;
-
- block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
- block_ctx->dev_bytenr,
- &state->block_hashtable);
- if (NULL == block) {
- struct btrfsic_dev_state *dev_state;
-
- block = btrfsic_block_alloc();
- if (!block)
- return NULL;
-
- dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
- if (NULL == dev_state) {
- pr_info("btrfsic: error, lookup dev_state failed!\n");
- btrfsic_block_free(block);
- return NULL;
- }
- block->dev_state = dev_state;
- block->dev_bytenr = block_ctx->dev_bytenr;
- block->logical_bytenr = block_ctx->start;
- block->is_metadata = is_metadata;
- block->is_iodone = is_iodone;
- block->never_written = never_written;
- block->mirror_num = mirror_num;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
- additional_string,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->bdev,
- block->dev_bytenr, mirror_num);
- list_add(&block->all_blocks_node, &state->all_blocks_list);
- btrfsic_block_hashtable_add(block, &state->block_hashtable);
- if (NULL != was_created)
- *was_created = 1;
- } else {
- if (NULL != was_created)
- *was_created = 0;
- }
-
- return block;
-}
-
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
- u64 bytenr,
- struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfsic_block_data_ctx block_ctx;
- int num_copies;
- int mirror_num;
- int match = 0;
- int ret;
-
- num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size);
-
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr, state->metablock_size,
- &block_ctx, mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n",
- bytenr, mirror_num);
- continue;
- }
-
- if (dev_state->bdev == block_ctx.dev->bdev &&
- dev_bytenr == block_ctx.dev_bytenr) {
- match++;
- btrfsic_release_block_ctx(&block_ctx);
- break;
- }
- btrfsic_release_block_ctx(&block_ctx);
- }
-
- if (WARN_ON(!match)) {
- pr_info(
-"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
- bytenr, dev_state->bdev, dev_bytenr);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr,
- state->metablock_size,
- &block_ctx, mirror_num);
- if (ret)
- continue;
-
- pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
- bytenr, block_ctx.dev->bdev,
- block_ctx.dev_bytenr, mirror_num);
- }
- }
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
-{
- return btrfsic_dev_state_hashtable_lookup(dev,
- &btrfsic_dev_state_hashtable);
-}
-
-static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
- unsigned int segs = bio_segments(bio);
- u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
- u64 cur_bytenr = dev_bytenr;
- struct bvec_iter iter;
- struct bio_vec bvec;
- char **mapped_datav;
- int bio_is_patched = 0;
- int i = 0;
-
- if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info(
-"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- return;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
-
- btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
- bio, &bio_is_patched, bio->bi_opf);
- kfree(mapped_datav);
-}
-
-static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
- if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
-
- if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
-
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- } else if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE))) {
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- }
-}
-
-void btrfsic_check_bio(struct bio *bio)
-{
- struct btrfsic_dev_state *dev_state;
-
- if (!btrfsic_is_initialized)
- return;
-
- /*
- * We can be called before btrfsic_mount, so there might not be a
- * dev_state.
- */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- mutex_lock(&btrfsic_mutex);
- if (dev_state) {
- if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
- btrfsic_check_write_bio(bio, dev_state);
- else if (bio->bi_opf & REQ_PREFLUSH)
- btrfsic_check_flush_bio(bio, dev_state);
- }
- mutex_unlock(&btrfsic_mutex);
-}
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices,
- int including_extent_data, u32 print_mask)
-{
- int ret;
- struct btrfsic_state *state;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
-
- if (!PAGE_ALIGNED(fs_info->nodesize)) {
- pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
- fs_info->nodesize, PAGE_SIZE);
- return -1;
- }
- if (!PAGE_ALIGNED(fs_info->sectorsize)) {
- pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
- fs_info->sectorsize, PAGE_SIZE);
- return -1;
- }
- state = kvzalloc(sizeof(*state), GFP_KERNEL);
- if (!state)
- return -ENOMEM;
-
- if (!btrfsic_is_initialized) {
- mutex_init(&btrfsic_mutex);
- btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
- btrfsic_is_initialized = 1;
- }
- mutex_lock(&btrfsic_mutex);
- state->fs_info = fs_info;
- state->print_mask = print_mask;
- state->include_extent_data = including_extent_data;
- state->metablock_size = fs_info->nodesize;
- state->datablock_size = fs_info->sectorsize;
- INIT_LIST_HEAD(&state->all_blocks_list);
- btrfsic_block_hashtable_init(&state->block_hashtable);
- btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
- state->max_superblock_generation = 0;
- state->latest_superblock = NULL;
-
- list_for_each_entry(device, dev_head, dev_list) {
- struct btrfsic_dev_state *ds;
-
- if (!device->bdev || !device->name)
- continue;
-
- ds = btrfsic_dev_state_alloc();
- if (NULL == ds) {
- mutex_unlock(&btrfsic_mutex);
- return -ENOMEM;
- }
- ds->bdev = device->bdev;
- ds->state = state;
- btrfsic_dev_state_hashtable_add(ds,
- &btrfsic_dev_state_hashtable);
- }
-
- ret = btrfsic_process_superblock(state, fs_devices);
- if (0 != ret) {
- mutex_unlock(&btrfsic_mutex);
- btrfsic_unmount(fs_devices);
- return ret;
- }
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
- btrfsic_dump_database(state);
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
- btrfsic_dump_tree(state);
-
- mutex_unlock(&btrfsic_mutex);
- return 0;
-}
-
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
-{
- struct btrfsic_block *b_all, *tmp_all;
- struct btrfsic_state *state;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
-
- if (!btrfsic_is_initialized)
- return;
-
- mutex_lock(&btrfsic_mutex);
-
- state = NULL;
- list_for_each_entry(device, dev_head, dev_list) {
- struct btrfsic_dev_state *ds;
-
- if (!device->bdev || !device->name)
- continue;
-
- ds = btrfsic_dev_state_hashtable_lookup(
- device->bdev->bd_dev,
- &btrfsic_dev_state_hashtable);
- if (NULL != ds) {
- state = ds->state;
- btrfsic_dev_state_hashtable_remove(ds);
- btrfsic_dev_state_free(ds);
- }
- }
-
- if (NULL == state) {
- pr_info("btrfsic: error, cannot find state information on umount!\n");
- mutex_unlock(&btrfsic_mutex);
- return;
- }
-
- /*
- * Don't care about keeping the lists' state up to date,
- * just free all memory that was allocated dynamically.
- * Free the blocks and the block_links.
- */
- list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
- all_blocks_node) {
- struct btrfsic_block_link *l, *tmp;
-
- list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
- node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_rem_link(state, l);
-
- l->ref_cnt--;
- if (0 == l->ref_cnt)
- btrfsic_block_link_free(l);
- }
-
- if (b_all->is_iodone || b_all->never_written)
- btrfsic_block_free(b_all);
- else
- pr_info(
-"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num);
- }
-
- mutex_unlock(&btrfsic_mutex);
-
- kvfree(state);
-}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
deleted file mode 100644
index e4c8aed7996f..000000000000
--- a/fs/btrfs/check-integrity.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) STRATO AG 2011. All rights reserved.
- */
-
-#ifndef BTRFS_CHECK_INTEGRITY_H
-#define BTRFS_CHECK_INTEGRITY_H
-
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_check_bio(struct bio *bio);
-#else
-static inline void btrfsic_check_bio(struct bio *bio) { }
-#endif
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices,
- int including_extent_data, u32 print_mask);
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices);
-
-#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e6acf09a1507..65d883da86c6 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -20,12 +20,11 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
+#include <linux/shrinker.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "fs.h"
-#include "disk-io.h"
-#include "transaction.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "ordered-data.h"
@@ -33,8 +32,7 @@
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
-#include "zoned.h"
-#include "file-item.h"
+#include "messages.h"
#include "super.h"
static struct bio_set btrfs_compressed_bioset;
@@ -92,20 +90,20 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
}
static int compression_compress_pages(int type, struct list_head *ws,
- struct address_space *mapping, u64 start, struct page **pages,
- unsigned long *out_pages, unsigned long *total_in,
- unsigned long *total_out)
+ struct address_space *mapping, u64 start,
+ struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
- return zlib_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return zlib_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_LZO:
- return lzo_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return lzo_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_ZSTD:
- return zstd_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return zstd_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -117,7 +115,7 @@ static int compression_compress_pages(int type, struct list_head *ws,
* Not a big deal, just need to inform caller that we
* haven't allocated any pages yet.
*/
- *out_pages = 0;
+ *out_folios = 0;
return -E2BIG;
}
}
@@ -140,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- const u8 *data_in, struct page *dest_page,
+ const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
- case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
- case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
+ case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
- case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
@@ -160,16 +158,110 @@ static int compression_decompress(int type, struct list_head *ws,
}
}
-static void btrfs_free_compressed_pages(struct compressed_bio *cb)
+static void btrfs_free_compressed_folios(struct compressed_bio *cb)
{
- for (unsigned int i = 0; i < cb->nr_pages; i++)
- put_page(cb->compressed_pages[i]);
- kfree(cb->compressed_pages);
+ for (unsigned int i = 0; i < cb->nr_folios; i++)
+ btrfs_free_compr_folio(cb->compressed_folios[i]);
+ kfree(cb->compressed_folios);
}
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static void end_compressed_bio_read(struct btrfs_bio *bbio)
+/*
+ * Global cache of last unused pages for compression/decompression.
+ */
+static struct btrfs_compr_pool {
+ struct shrinker *shrinker;
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int thresh;
+} compr_pool;
+
+static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
+{
+ int ret;
+
+ /*
+ * We must not read the values more than once if 'ret' gets expanded in
+ * the return statement so we don't accidentally return a negative
+ * number, even if the first condition finds it positive.
+ */
+ ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
+
+ return ret > 0 ? ret : 0;
+}
+
+static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
+{
+ struct list_head remove;
+ struct list_head *tmp, *next;
+ int freed;
+
+ if (compr_pool.count == 0)
+ return SHRINK_STOP;
+
+ INIT_LIST_HEAD(&remove);
+
+ /* For now, just simply drain the whole list. */
+ spin_lock(&compr_pool.lock);
+ list_splice_init(&compr_pool.list, &remove);
+ freed = compr_pool.count;
+ compr_pool.count = 0;
+ spin_unlock(&compr_pool.lock);
+
+ list_for_each_safe(tmp, next, &remove) {
+ struct page *page = list_entry(tmp, struct page, lru);
+
+ ASSERT(page_ref_count(page) == 1);
+ put_page(page);
+ }
+
+ return freed;
+}
+
+/*
+ * Common wrappers for page allocation from compression wrappers
+ */
+struct folio *btrfs_alloc_compr_folio(void)
+{
+ struct folio *folio = NULL;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > 0) {
+ folio = list_first_entry(&compr_pool.list, struct folio, lru);
+ list_del_init(&folio->lru);
+ compr_pool.count--;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (folio)
+ return folio;
+
+ return folio_alloc(GFP_NOFS, 0);
+}
+
+void btrfs_free_compr_folio(struct folio *folio)
+{
+ bool do_free = false;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > compr_pool.thresh) {
+ do_free = true;
+ } else {
+ list_add(&folio->lru, &compr_pool.list);
+ compr_pool.count++;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (!do_free)
+ return;
+
+ ASSERT(folio_ref_count(folio) == 1);
+ folio_put(folio);
+}
+
+static void end_bbio_compressed_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
blk_status_t status = bbio->bio.bi_status;
@@ -177,7 +269,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
if (!status)
status = errno_to_blk_status(btrfs_decompress_bio(cb));
- btrfs_free_compressed_pages(cb);
+ btrfs_free_compressed_folios(cb);
btrfs_bio_end_io(cb->orig_bbio, status);
bio_put(&bbio->bio);
}
@@ -189,16 +281,16 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
struct inode *inode = &cb->bbio.inode->vfs_inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
- const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
+ const int error = blk_status_to_errno(cb->bbio.bio.bi_status);
int i;
int ret;
- if (errno)
- mapping_set_error(inode->i_mapping, errno);
+ if (error)
+ mapping_set_error(inode->i_mapping, error);
folio_batch_init(&fbatch);
while (index <= end_index) {
@@ -211,8 +303,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
for (i = 0; i < ret; i++) {
struct folio *folio = fbatch.folios[i];
- btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
- cb->start, cb->len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio,
+ cb->start, cb->len);
}
folio_batch_release(&fbatch);
}
@@ -231,7 +323,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
end_compressed_writeback(cb);
/* Note, our inode could be gone now */
- btrfs_free_compressed_pages(cb);
+ btrfs_free_compressed_folios(cb);
bio_put(&cb->bbio.bio);
}
@@ -242,7 +334,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct btrfs_bio *bbio)
+static void end_bbio_compressed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -250,17 +342,19 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio)
queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
}
-static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
+static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
{
struct bio *bio = &cb->bbio.bio;
u32 offset = 0;
while (offset < cb->compressed_len) {
+ int ret;
u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
/* Maximum compressed extent is smaller than bio size limit. */
- __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT],
- len, 0);
+ ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
+ len, 0);
+ ASSERT(ret);
offset += len;
}
}
@@ -275,12 +369,12 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
* the end io hooks.
*/
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
- struct page **compressed_pages,
- unsigned int nr_pages,
+ struct folio **compressed_folios,
+ unsigned int nr_folios,
blk_opf_t write_flags,
bool writeback)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct compressed_bio *cb;
@@ -289,19 +383,19 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb = alloc_compressed_bio(inode, ordered->file_offset,
REQ_OP_WRITE | write_flags,
- end_compressed_bio_write);
+ end_bbio_compressed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
- cb->compressed_pages = compressed_pages;
+ cb->compressed_folios = compressed_folios;
cb->compressed_len = ordered->disk_num_bytes;
cb->writeback = writeback;
INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
- cb->nr_pages = nr_pages;
+ cb->nr_folios = nr_folios;
cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
cb->bbio.ordered = ordered;
- btrfs_add_compressed_bio_pages(cb);
+ btrfs_add_compressed_bio_folios(cb);
- btrfs_submit_bio(&cb->bbio, 0);
+ btrfs_submit_bbio(&cb->bbio, 0);
}
/*
@@ -320,13 +414,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
struct compressed_bio *cb,
int *memstall, unsigned long *pflags)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
unsigned long end_index;
struct bio *orig_bio = &cb->orig_bbio->bio;
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
u64 isize = i_size_read(inode);
int ret;
- struct page *page;
+ struct folio *folio;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
@@ -346,7 +440,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* This makes readahead less effective, so here disable readahead for
* subpage for now, until full compressed write is supported.
*/
- if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+ if (fs_info->sectorsize < PAGE_SIZE)
return 0;
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
@@ -359,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- page = xa_load(&mapping->i_pages, pg_index);
- if (page && !xa_is_value(page)) {
- sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ folio = __filemap_get_folio(mapping, pg_index, 0, 0);
+ if (!IS_ERR(folio)) {
+ u64 folio_sz = folio_size(folio);
+ u64 offset = offset_in_folio(folio, cur);
+
+ folio_put(folio);
+ sectors_missed += (folio_sz - offset) >>
fs_info->sectorsize_bits;
/* Beyond threshold, no need to continue */
@@ -372,35 +470,35 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* Jump to next page start as we already have page for
* current offset.
*/
- cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ cur += (folio_sz - offset);
continue;
}
- page = __page_cache_alloc(mapping_gfp_constraint(mapping,
- ~__GFP_FS));
- if (!page)
+ folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
+ ~__GFP_FS), 0);
+ if (!folio)
break;
- if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
- put_page(page);
+ if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) {
/* There is already a page, skip to page end */
- cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ cur += folio_size(folio);
+ folio_put(folio);
continue;
}
- if (!*memstall && PageWorkingset(page)) {
+ if (!*memstall && folio_test_workingset(folio)) {
psi_memstall_enter(pflags);
*memstall = 1;
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
- page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
@@ -413,31 +511,32 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
if (!em || cur < em->start ||
(cur + fs_info->sectorsize > extent_map_end(em)) ||
- (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) {
+ (extent_map_block_start(em) >> SECTOR_SHIFT) !=
+ orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, cur, page_end, NULL);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
add_size = min(em->start + em->len, page_end + 1) - cur;
free_extent_map(em);
+ unlock_extent(tree, cur, page_end, NULL);
- if (page->index == end_index) {
- size_t zero_offset = offset_in_page(isize);
+ if (folio->index == end_index) {
+ size_t zero_offset = offset_in_folio(folio, isize);
if (zero_offset) {
int zeros;
- zeros = PAGE_SIZE - zero_offset;
- memzero_page(page, zero_offset, zeros);
+ zeros = folio_size(folio) - zero_offset;
+ folio_zero_range(folio, zero_offset, zeros);
}
}
- ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
- if (ret != add_size) {
- unlock_extent(tree, cur, page_end, NULL);
- unlock_page(page);
- put_page(page);
+ if (!bio_add_folio(orig_bio, folio, add_size,
+ offset_in_folio(folio, cur))) {
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
/*
@@ -446,8 +545,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, page, cur, add_size);
- put_page(page);
+ btrfs_folio_set_lock(fs_info, folio, cur, add_size);
+ folio_put(folio);
cur += add_size;
}
return 0;
@@ -489,31 +588,31 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out;
}
- ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
- compressed_len = em->block_len;
+ ASSERT(extent_map_is_compressed(em));
+ compressed_len = em->disk_num_bytes;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
- end_compressed_bio_read);
+ end_bbio_compressed_read);
- cb->start = em->orig_start;
+ cb->start = em->start - em->offset;
em_len = em->len;
em_start = em->start;
cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = em->compress_type;
+ cb->compress_type = extent_map_compression(em);
cb->orig_bbio = bbio;
free_extent_map(em);
- cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!cb->compressed_pages) {
+ cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
+ if (!cb->compressed_folios) {
ret = BLK_STS_RESOURCE;
goto out_free_bio;
}
- ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -525,16 +624,16 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
/* include any pages we added in add_ra-bio_pages */
cb->len = bbio->bio.bi_iter.bi_size;
cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
- btrfs_add_compressed_bio_pages(cb);
+ btrfs_add_compressed_bio_folios(cb);
if (memstall)
psi_memstall_leave(&pflags);
- btrfs_submit_bio(&cb->bbio, 0);
+ btrfs_submit_bbio(&cb->bbio, 0);
return;
out_free_compressed_pages:
- kfree(cb->compressed_pages);
+ kfree(cb->compressed_folios);
out_free_bio:
bio_put(&cb->bbio.bio);
out:
@@ -881,6 +980,29 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
return level;
}
+/* Wrapper around find_get_page(), with extra error message. */
+int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
+ struct folio **in_folio_ret)
+{
+ struct folio *in_folio;
+
+ /*
+ * The compressed write path should have the folio locked already, thus
+ * we only need to grab one reference.
+ */
+ in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(in_folio)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_crit(inode->root->fs_info,
+ "failed to get page cache, root %lld ino %llu file offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode), start);
+ return -ENOENT;
+ }
+ *in_folio_ret = in_folio;
+ return 0;
+}
+
/*
* Given an address space and start and length, compress the bytes into @pages
* that are allocated on demand.
@@ -901,11 +1023,9 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
- u64 start, struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
int type = btrfs_compress_type(type_level);
int level = btrfs_compress_level(type_level);
@@ -914,8 +1034,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
level = btrfs_compress_set_level(type, level);
workspace = get_workspace(type, level);
- ret = compression_compress_pages(type, workspace, mapping, start, pages,
- out_pages, total_in, total_out);
+ ret = compression_compress_pages(type, workspace, mapping, start, folios,
+ out_folios, total_in, total_out);
put_workspace(type, workspace);
return ret;
}
@@ -940,10 +1060,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
-int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
+int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
struct list_head *workspace;
const u32 sectorsize = fs_info->sectorsize;
int ret;
@@ -956,7 +1076,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
workspace = get_workspace(type, 0);
- ret = compression_decompress(type, workspace, data_in, dest_page,
+ ret = compression_decompress(type, workspace, data_in, dest_folio,
dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
@@ -969,15 +1089,36 @@ int __init btrfs_init_compress(void)
offsetof(struct compressed_bio, bbio.bio),
BIOSET_NEED_BVECS))
return -ENOMEM;
+
+ compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
+ if (!compr_pool.shrinker)
+ return -ENOMEM;
+
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
+
+ spin_lock_init(&compr_pool.lock);
+ INIT_LIST_HEAD(&compr_pool.list);
+ compr_pool.count = 0;
+ /* 128K / 4K = 32, for 8 threads is 256 pages. */
+ compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
+ compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
+ compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
+ compr_pool.shrinker->batch = 32;
+ compr_pool.shrinker->seeks = DEFAULT_SEEKS;
+ shrinker_register(compr_pool.shrinker);
+
return 0;
}
void __cold btrfs_exit_compress(void)
{
+ /* For now scan drains all pages and does not touch the parameters. */
+ btrfs_compr_pool_scan(NULL, NULL);
+ shrinker_free(compr_pool.shrinker);
+
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
@@ -1362,11 +1503,6 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
/*
* Compression heuristic.
*
- * For now is's a naive and optimistic 'return true', we'll extend the logic to
- * quickly (compared to direct compression) detect data characteristics
- * (compressible/incompressible) to avoid wasting CPU time on incompressible
- * data.
- *
* The following types of analysis can be performed:
* - detect mostly zero data
* - detect data with low "byte set" size (text, etc)
@@ -1374,7 +1510,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*
* Return non-zero if the compression should be done, 0 otherwise.
*/
-int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
+int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
@@ -1384,7 +1520,7 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
ws = list_entry(ws_list, struct heuristic_ws, list);
- heuristic_collect_sample(inode, start, end, ws);
+ heuristic_collect_sample(&inode->vfs_inode, start, end, ws);
if (sample_repeated_patterns(ws)) {
ret = 1;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 609865c94065..b6563b6a333e 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -7,10 +7,18 @@
#define BTRFS_COMPRESSION_H
#include <linux/sizes.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
#include "bio.h"
+struct address_space;
+struct page;
+struct inode;
struct btrfs_inode;
struct btrfs_ordered_extent;
+struct btrfs_bio;
/*
* We want to make sure that amount of RAM required to uncompress an extent is
@@ -33,11 +41,11 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* Number of compressed pages in the array */
- unsigned int nr_pages;
+ /* Number of compressed folios in the array. */
+ unsigned int nr_folios;
- /* the pages with the compressed data on them */
- struct page **compressed_pages;
+ /* The folios with the compressed data on them. */
+ struct folio **compressed_folios;
/* starting offset in the inode for our pages */
u64 start;
@@ -74,28 +82,36 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
return ((type_level & 0xF0) >> 4);
}
+/* @range_end must be exclusive. */
+static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
+{
+ u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;
+
+ return min(range_end, page_end) - cur;
+}
+
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
-int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
- u64 start, struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out);
-int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
+int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out);
+int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
- struct page **compressed_pages,
- unsigned int nr_pages,
- blk_opf_t write_flags,
- bool writeback);
+ struct folio **compressed_folios,
+ unsigned int nr_folios, blk_opf_t write_flags,
+ bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+struct folio *btrfs_alloc_compr_folio(void);
+void btrfs_free_compr_folio(struct folio *folio);
+
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
@@ -136,35 +152,38 @@ extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
-int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
+
+int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
+ struct folio **in_folio_ret);
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
struct list_head *zlib_get_workspace(unsigned int level);
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 25c902e7556d..3ba15d9c3e88 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p)
* cause could be a bug, eg. due to ENOSPC, and not for common errors that are
* caused by external factors.
*/
-bool __cold abort_should_print_stack(int errno)
+bool __cold abort_should_print_stack(int error)
{
- switch (errno) {
+ switch (error) {
case -EIO:
case -EROFS:
case -ENOMEM:
@@ -291,7 +291,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
- if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
else
@@ -316,11 +316,12 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
+ u64 reloc_src_root = 0;
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- trans->transid != root->last_trans);
+ trans->transid != btrfs_get_root_last_trans(root));
level = btrfs_header_level(buf);
if (level == 0)
@@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ reloc_src_root = btrfs_header_owner(buf);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
&disk_key, level, buf->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ reloc_src_root, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -367,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf)
{
+ const u64 buf_gen = btrfs_header_generation(buf);
+
/*
* Tree blocks not in shareable trees and tree roots are never shared.
* If a block was allocated after the last snapshot and the block was
* not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- buf != root->node &&
- (btrfs_header_generation(buf) <=
- btrfs_root_last_snapshot(&root->root_item) ||
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
- if (buf != root->commit_root)
- return 1;
- /*
- * An extent buffer that used to be the commit root may still be
- * shared because the tree height may have increased and it
- * became a child of a higher level root. This can happen when
- * snapshotting a subvolume created in the current transaction.
- */
- if (btrfs_header_generation(buf) == trans->transid)
- return 1;
- }
- return 0;
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return false;
+
+ if (buf == root->node)
+ return false;
+
+ if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return false;
+
+ if (buf != root->commit_root)
+ return true;
+
+ /*
+ * An extent buffer that used to be the commit root may still be shared
+ * because the tree height may have increased and it became a child of a
+ * higher level root. This can happen when snapshotting a subvolume
+ * created in the current transaction.
+ */
+ if (buf_gen == trans->transid)
+ return true;
+
+ return false;
}
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
@@ -406,7 +417,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
u64 refs;
u64 owner;
u64 flags;
- u64 new_flags = 0;
int ret;
/*
@@ -429,7 +439,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(trans, root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
- &refs, &flags);
+ &refs, &flags, NULL);
if (ret)
return ret;
if (unlikely(refs == 0)) {
@@ -443,7 +453,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
} else {
refs = 1;
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
else
@@ -463,15 +473,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
if (refs > 1) {
- if ((owner == root->root_key.objectid ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+ if ((owner == btrfs_root_id(root) ||
+ btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
ret = btrfs_inc_ref(trans, root, buf, 1);
if (ret)
return ret;
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_dec_ref(trans, root, buf, 0);
if (ret)
return ret;
@@ -479,26 +488,22 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
- new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ ret = btrfs_set_disk_extent_flags(trans, buf,
+ BTRFS_BLOCK_FLAG_FULL_BACKREF);
+ if (ret)
+ return ret;
} else {
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
if (ret)
return ret;
}
- if (new_flags != 0) {
- ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
- if (ret)
- return ret;
- }
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
@@ -526,13 +531,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
*/
-static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf,
- struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size,
- enum btrfs_lock_nesting nest)
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -541,6 +546,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start = 0;
+ u64 reloc_src_root = 0;
if (*cow_ret == buf)
unlock_orig = 1;
@@ -550,7 +556,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- trans->transid != root->last_trans);
+ trans->transid != btrfs_get_root_last_trans(root));
level = btrfs_header_level(buf);
@@ -559,12 +565,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
- parent_start = parent->start;
-
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent)
+ parent_start = parent->start;
+ reloc_src_root = btrfs_header_owner(buf);
+ }
cow = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, &disk_key, level,
- search_start, empty_size, nest);
+ btrfs_root_id(root), &disk_key, level,
+ search_start, empty_size, reloc_src_root, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -576,43 +584,37 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
BTRFS_HEADER_FLAG_RELOC);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
else
- btrfs_set_header_owner(cow, root->root_key.objectid);
+ btrfs_set_header_owner(cow, btrfs_root_id(root));
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
}
if (buf == root->node) {
WARN_ON(parent && parent != buf);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
if (ret < 0) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
atomic_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
@@ -622,20 +624,16 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
free_extent_buffer(buf);
add_root_to_dirty_list(root);
if (ret < 0) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
@@ -645,27 +643,30 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
}
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
parent_start, last_ref);
if (ret < 0) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
}
+
+ trace_btrfs_cow_block(root, buf, cow);
if (unlock_orig)
btrfs_tree_unlock(buf);
free_extent_buffer_stale(buf);
btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
+
+error_unlock_cow:
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ return ret;
}
static inline int should_cow_block(struct btrfs_trans_handle *trans,
@@ -691,7 +692,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
*/
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
- !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
!test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return 0;
@@ -699,11 +700,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
}
/*
- * cows a single block, see __btrfs_cow_block for the real work.
+ * COWs a single block, see btrfs_force_cow_block() for the real work.
* This version of it has extra checks so that a block isn't COWed more than
* once per transaction, as long as it hasn't been written yet
*/
-noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
@@ -711,7 +712,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
- int ret;
if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
btrfs_abort_transaction(trans, -EUCLEAN);
@@ -743,7 +743,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
- search_start = buf->start & ~((u64)SZ_1G - 1);
+ search_start = round_down(buf->start, SZ_1G);
/*
* Before CoWing this block for later modification, check if it's
@@ -752,59 +752,12 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
* Also We don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
- ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0, nest);
-
- trace_btrfs_cow_block(root, buf, *cow_ret);
-
- return ret;
+ return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
+ cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
- * helper function for defrag to decide if two blocks pointed to by a
- * node are actually close by
- */
-static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
-{
- if (blocknr < other && other - (blocknr + blocksize) < 32768)
- return 1;
- if (blocknr > other && blocknr - (other + blocksize) < 32768)
- return 1;
- return 0;
-}
-
-#ifdef __LITTLE_ENDIAN
-
-/*
- * Compare two keys, on little-endian the disk order is same as CPU order and
- * we can avoid the conversion.
- */
-static int comp_keys(const struct btrfs_disk_key *disk_key,
- const struct btrfs_key *k2)
-{
- const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
-
- return btrfs_comp_cpu_keys(k1, k2);
-}
-
-#else
-
-/*
- * compare two keys in a memcmp fashion
- */
-static int comp_keys(const struct btrfs_disk_key *disk,
- const struct btrfs_key *k2)
-{
- struct btrfs_key k1;
-
- btrfs_disk_key_to_cpu(&k1, disk);
-
- return btrfs_comp_cpu_keys(&k1, k2);
-}
-#endif
-
-/*
* same as comp_keys only with two btrfs_key's
*/
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
@@ -825,105 +778,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
}
/*
- * this is used by the defrag code to go through all the
- * leaves pointed to by a node and reallocate them so that
- * disk order is close to key order
- */
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, u64 *last_ret,
- struct btrfs_key *progress)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *cur;
- u64 blocknr;
- u64 search_start = *last_ret;
- u64 last_block = 0;
- u64 other;
- u32 parent_nritems;
- int end_slot;
- int i;
- int err = 0;
- u32 blocksize;
- int progress_passed = 0;
- struct btrfs_disk_key disk_key;
-
- /*
- * COWing must happen through a running transaction, which always
- * matches the current fs generation (it's a transaction with a state
- * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
- * into error state to prevent the commit of any transaction.
- */
- if (unlikely(trans->transaction != fs_info->running_transaction ||
- trans->transid != fs_info->generation)) {
- btrfs_abort_transaction(trans, -EUCLEAN);
- btrfs_crit(fs_info,
-"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
- parent->start, btrfs_root_id(root), trans->transid,
- fs_info->running_transaction->transid,
- fs_info->generation);
- return -EUCLEAN;
- }
-
- parent_nritems = btrfs_header_nritems(parent);
- blocksize = fs_info->nodesize;
- end_slot = parent_nritems - 1;
-
- if (parent_nritems <= 1)
- return 0;
-
- for (i = start_slot; i <= end_slot; i++) {
- int close = 1;
-
- btrfs_node_key(parent, &disk_key, i);
- if (!progress_passed && comp_keys(&disk_key, progress) < 0)
- continue;
-
- progress_passed = 1;
- blocknr = btrfs_node_blockptr(parent, i);
- if (last_block == 0)
- last_block = blocknr;
-
- if (i > 0) {
- other = btrfs_node_blockptr(parent, i - 1);
- close = close_blocks(blocknr, other, blocksize);
- }
- if (!close && i < end_slot) {
- other = btrfs_node_blockptr(parent, i + 1);
- close = close_blocks(blocknr, other, blocksize);
- }
- if (close) {
- last_block = blocknr;
- continue;
- }
-
- cur = btrfs_read_node_slot(parent, i);
- if (IS_ERR(cur))
- return PTR_ERR(cur);
- if (search_start == 0)
- search_start = last_block;
-
- btrfs_tree_lock(cur);
- err = __btrfs_cow_block(trans, root, cur, parent, i,
- &cur, search_start,
- min(16 * blocksize,
- (end_slot - i) * blocksize),
- BTRFS_NESTING_COW);
- if (err) {
- btrfs_tree_unlock(cur);
- free_extent_buffer(cur);
- break;
- }
- search_start = cur->start;
- last_block = cur->start;
- *last_ret = search_start;
- btrfs_tree_unlock(cur);
- free_extent_buffer(cur);
- }
- return err;
-}
-
-/*
* Search for a key in the given extent_buffer.
*
* The lower boundary for the search is specified by the slot number @first_slot.
@@ -968,7 +822,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
}
while (low < high) {
- unsigned long oip;
+ const int unit_size = eb->folio_size;
+ unsigned long oil;
unsigned long offset;
struct btrfs_disk_key *tmp;
struct btrfs_disk_key unaligned;
@@ -976,20 +831,20 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
mid = (low + high) / 2;
offset = p + mid * item_size;
- oip = offset_in_page(offset);
+ oil = get_eb_offset_in_folio(eb, offset);
- if (oip + key_size <= PAGE_SIZE) {
- const unsigned long idx = get_eb_page_index(offset);
- char *kaddr = page_address(eb->pages[idx]);
+ if (oil + key_size <= unit_size) {
+ const unsigned long idx = get_eb_folio_index(eb, offset);
+ char *kaddr = folio_address(eb->folios[idx]);
- oip = get_eb_offset_in_page(eb, offset);
- tmp = (struct btrfs_disk_key *)(kaddr + oip);
+ oil = get_eb_offset_in_folio(eb, offset);
+ tmp = (struct btrfs_disk_key *)(kaddr + oil);
} else {
read_extent_buffer(eb, &unaligned, offset, key_size);
tmp = &unaligned;
}
- ret = comp_keys(tmp, key);
+ ret = btrfs_comp_keys(tmp, key);
if (ret < 0)
low = mid + 1;
@@ -1004,19 +859,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
return 1;
}
-static void root_add_used(struct btrfs_root *root, u32 size)
+static void root_add_used_bytes(struct btrfs_root *root)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
- btrfs_root_used(&root->root_item) + size);
+ btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
spin_unlock(&root->accounting_lock);
}
-static void root_sub_used(struct btrfs_root *root, u32 size)
+static void root_sub_used_bytes(struct btrfs_root *root)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
- btrfs_root_used(&root->root_item) - size);
+ btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
spin_unlock(&root->accounting_lock);
}
@@ -1132,7 +987,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* once for the path */
free_extent_buffer(mid);
- root_sub_used(root, mid->len);
+ root_sub_used_bytes(root);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
@@ -1154,7 +1009,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -1172,7 +1027,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right,
BTRFS_NESTING_RIGHT_COW);
@@ -1206,9 +1061,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
goto out;
}
- root_sub_used(root, right->len);
- ret = btrfs_free_tree_block(trans, btrfs_root_id(root), right,
- 0, 1);
+ root_sub_used_bytes(root);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
+ right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
if (ret < 0) {
@@ -1268,7 +1123,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = NULL;
goto out;
}
- root_sub_used(root, mid->len);
+ root_sub_used_bytes(root);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
@@ -1364,7 +1219,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (IS_ERR(left))
return PTR_ERR(left);
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -1424,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (IS_ERR(right))
return PTR_ERR(right);
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -1670,7 +1525,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
check.has_first_key = true;
check.level = parent_level - 1;
check.transid = gen;
- check.owner_root = root->root_key.objectid;
+ check.owner_root = btrfs_root_id(root);
/*
* If we need to read an extent buffer from disk and we are holding locks
@@ -1713,12 +1568,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
- return -EIO;
- }
- if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EUCLEAN;
+ return ret;
}
if (unlock_up)
@@ -2038,7 +1888,7 @@ static int search_leaf(struct btrfs_trans_handle *trans,
* the extent buffer's header and we have recently accessed
* the header's level field.
*/
- ret = comp_keys(&first_key, key);
+ ret = btrfs_comp_keys(&first_key, key);
if (ret < 0) {
/*
* The first key is smaller than the key we want
@@ -2123,8 +1973,8 @@ static int search_leaf(struct btrfs_trans_handle *trans,
}
/*
- * btrfs_search_slot - look for a key in a tree and perform necessary
- * modifications to preserve tree invariants.
+ * Look for a key in a tree and perform necessary modifications to preserve
+ * tree invariants.
*
* @trans: Handle of transaction, used when modifying the tree
* @p: Holds all btree nodes along the search path
@@ -2551,7 +2401,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
*/
if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
- ret = comp_keys(&found_key, &orig_key);
+ ret = btrfs_comp_keys(&found_key, &orig_key);
if (ret == 0) {
if (path->slots[0] > 0) {
path->slots[0]--;
@@ -2566,7 +2416,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
}
btrfs_item_key(path->nodes[0], &found_key, 0);
- ret = comp_keys(&found_key, &key);
+ ret = btrfs_comp_keys(&found_key, &key);
/*
* We might have had an item with the previous key in the tree right
* before we released our path. And after we released our path, that
@@ -2715,8 +2565,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
*
*/
static void fixup_low_keys(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_disk_key *key, int level)
+ const struct btrfs_path *path,
+ const struct btrfs_disk_key *key, int level)
{
int i;
struct extent_buffer *t;
@@ -2745,7 +2595,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
* that the new key won't break the order
*/
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const struct btrfs_key *new_key)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2757,7 +2607,7 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+ if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2771,7 +2621,7 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+ if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2811,8 +2661,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
* is correct, we only need to bother the last key of @left and the first
* key of @right.
*/
-static bool check_sibling_keys(struct extent_buffer *left,
- struct extent_buffer *right)
+static bool check_sibling_keys(const struct extent_buffer *left,
+ const struct extent_buffer *right)
{
struct btrfs_key left_last;
struct btrfs_key right_first;
@@ -3012,7 +2862,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 lower_gen;
struct extent_buffer *lower;
struct extent_buffer *c;
@@ -3029,13 +2878,13 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
&lower_key, level, root->node->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ 0, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
btrfs_set_header_nritems(c, 1);
btrfs_set_node_key(c, &lower_key, 0);
@@ -3052,6 +2901,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (ret < 0) {
int ret2;
+ btrfs_clear_buffer_dirty(trans, c);
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
if (ret2 < 0)
btrfs_abort_transaction(trans, ret2);
@@ -3080,8 +2930,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
* blocknr is the block the key points to.
*/
static int insert_ptr(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_disk_key *key, u64 bytenr,
+ const struct btrfs_path *path,
+ const struct btrfs_disk_key *key, u64 bytenr,
int slot, int level)
{
struct extent_buffer *lower;
@@ -3177,13 +3027,13 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
&disk_key, level, c->start, 0,
- BTRFS_NESTING_SPLIT);
+ 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
@@ -3435,7 +3285,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return PTR_ERR(right);
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
@@ -3651,7 +3501,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return PTR_ERR(left);
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
@@ -3929,14 +3779,14 @@ again:
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
- right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &disk_key, 0, l->start, 0,
+ right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
+ &disk_key, 0, l->start, 0, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
if (split == 0) {
if (mid <= slot) {
@@ -4171,7 +4021,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
* the front.
*/
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u32 new_size, int from_end)
+ const struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -4263,7 +4113,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
* make the item pointed to by the path bigger, data_size is the added size.
*/
void btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u32 data_size)
+ const struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
@@ -4448,6 +4298,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
+ *
+ * Returns: 0 on success
+ * -EEXIST if the first key already exists
+ * < 0 on other errors
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -4616,7 +4470,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
*/
btrfs_unlock_up_safe(path, 0);
- root_sub_used(root, leaf->len);
+ root_sub_used_bytes(root);
atomic_inc(&leaf->refs);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
@@ -5253,9 +5107,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
int __init btrfs_ctree_init(void)
{
- btrfs_path_cachep = kmem_cache_create("btrfs_path",
- sizeof(struct btrfs_path), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
if (!btrfs_path_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f7bb4c34b984..2034d3710833 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,53 +6,26 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/highmem.h>
-#include <linux/fs.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/completion.h>
-#include <linux/backing-dev.h>
-#include <linux/wait.h>
-#include <linux/slab.h>
-#include <trace/events/btrfs.h>
-#include <asm/unaligned.h>
+#include "linux/cleanup.h"
#include <linux/pagemap.h>
-#include <linux/btrfs.h>
-#include <linux/btrfs_tree.h>
-#include <linux/workqueue.h>
-#include <linux/security.h>
-#include <linux/sizes.h>
-#include <linux/dynamic_debug.h>
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/xarray.h>
#include <linux/refcount.h>
-#include <linux/crc32c.h>
-#include <linux/iomap.h>
-#include <linux/fscrypt.h>
-#include "extent-io-tree.h"
-#include "extent_io.h"
-#include "extent_map.h"
-#include "async-thread.h"
-#include "block-rsv.h"
+#include <uapi/linux/btrfs_tree.h>
#include "locking.h"
-#include "misc.h"
#include "fs.h"
+#include "accessors.h"
+#include "extent-io-tree.h"
+struct extent_buffer;
+struct btrfs_block_rsv;
struct btrfs_trans_handle;
-struct btrfs_transaction;
-struct btrfs_pending_snapshot;
-struct btrfs_delayed_ref_root;
-struct btrfs_space_info;
struct btrfs_block_group;
-struct btrfs_ordered_sum;
-struct btrfs_ref;
-struct btrfs_bio;
-struct btrfs_ioctl_encoded_io_args;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-struct reloc_control;
/* Read ahead values for struct btrfs_path.reada */
enum {
@@ -112,6 +85,9 @@ struct btrfs_path {
unsigned int nowait:1;
};
+#define BTRFS_PATH_AUTO_FREE(path_name) \
+ struct btrfs_path *path_name __free(btrfs_free_path) = NULL
+
/*
* The state of btrfs root
*/
@@ -218,17 +194,27 @@ struct btrfs_root {
atomic_t log_commit[2];
/* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
+ /*
+ * Protected by the 'log_mutex' lock but can be read without holding
+ * that lock to avoid unnecessary lock contention, in which case it
+ * should be read using btrfs_get_root_log_transid() except if it's a
+ * log tree in which case it can be directly accessed. Updates to this
+ * field should always use btrfs_set_root_log_transid(), except for log
+ * trees where the field can be updated directly.
+ */
int log_transid;
/* No matter the commit succeeds or not*/
int log_transid_committed;
- /* Just be updated when the commit succeeds. */
+ /*
+ * Just be updated when the commit succeeds. Use
+ * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit()
+ * to access this field.
+ */
int last_log_commit;
pid_t log_start_pid;
u64 last_trans;
- u32 type;
-
u64 free_objectid;
struct btrfs_key defrag_progress;
@@ -239,18 +225,17 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t log_extents_lock[2];
- struct list_head logged_list[2];
-
- spinlock_t inode_lock;
- /* red-black tree that keeps track of in-memory inodes */
- struct rb_root inode_tree;
+ /*
+ * Xarray that keeps track of in-memory inodes, protected by the lock
+ * @inode_lock.
+ */
+ struct xarray inodes;
/*
- * radix tree that keeps track of delayed nodes of every inode,
- * protected by inode_lock
+ * Xarray that keeps track of delayed nodes of every inode, protected
+ * by @inode_lock.
*/
- struct radix_tree_root delayed_nodes_tree;
+ struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -326,6 +311,9 @@ struct btrfs_root {
/* Used only by log trees, when logging csum items */
struct extent_io_tree log_csum_range;
+ /* Used in simple quotas, track root during relocation. */
+ u64 relocation_src_root;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -352,6 +340,55 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root)
return root->root_key.objectid;
}
+static inline int btrfs_get_root_log_transid(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->log_transid);
+}
+
+static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid)
+{
+ WRITE_ONCE(root->log_transid, log_transid);
+}
+
+static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->last_log_commit);
+}
+
+static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id)
+{
+ WRITE_ONCE(root->last_log_commit, commit_id);
+}
+
+static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->last_trans);
+}
+
+static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid)
+{
+ WRITE_ONCE(root->last_trans, transid);
+}
+
+/*
+ * Return the generation this root started with.
+ *
+ * Every normal root that is created with root->root_key.offset set to it's
+ * originating generation. If it is a snapshot it is the generation when the
+ * snapshot was created.
+ *
+ * However for TREE_RELOC roots root_key.offset is the objectid of the owning
+ * tree root. Thankfully we copy the root item of the owning tree root, which
+ * has it's last_snapshot set to what we would have root_key.offset set to, so
+ * return that if this is a TREE_RELOC root.
+ */
+static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root)
+{
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+ return btrfs_root_last_snapshot(&root->root_item);
+ return root->root_key.offset;
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
@@ -472,37 +509,12 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
((bytes) >> (fs_info)->sectorsize_bits)
-static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
-{
- return crc32c(crc, address, length);
-}
-
-static inline void btrfs_crc32c_final(u32 crc, u8 *result)
-{
- put_unaligned_le32(~crc, result);
-}
-
-static inline u64 btrfs_name_hash(const char *name, int len)
-{
- return crc32c((u32)~1, name, len);
-}
-
-/*
- * Figure the key offset of an extended inode ref
- */
-static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
- int len)
-{
- return (u64) crc32c(parent_objectid, name, len);
-}
-
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
-int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
- u64 start, u64 end);
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
@@ -515,13 +527,43 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
+/* Compare two keys in a memcmp fashion. */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk,
+ const struct btrfs_key *k2)
+{
+ struct btrfs_key k1;
+
+ btrfs_disk_key_to_cpu(&k1, disk);
+
+ return btrfs_comp_cpu_keys(&k1, k2);
+}
+
+#endif
+
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid);
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
@@ -538,19 +580,26 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
enum btrfs_lock_nesting nest);
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u32 data_size);
+ const struct btrfs_path *path, u32 data_size);
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u32 new_size, int from_end);
+ const struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -571,13 +620,10 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_path *p, int find_higher,
int return_any);
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, u64 *last_ret,
- struct btrfs_key *progress);
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
+DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index e1475dfdf7a8..968dae953948 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -6,7 +6,6 @@
#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
@@ -46,8 +45,8 @@ struct inode_defrag {
u32 extent_thresh;
};
-static int __compare_inode_defrag(struct inode_defrag *defrag1,
- struct inode_defrag *defrag2)
+static int compare_inode_defrag(const struct inode_defrag *defrag1,
+ const struct inode_defrag *defrag2)
{
if (defrag1->root > defrag2->root)
return 1;
@@ -62,16 +61,14 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
}
/*
- * Pop a record for an inode into the defrag tree. The lock must be held
+ * Insert a record for an inode into the defrag tree. The lock must be held
* already.
*
* If you're inserting a record for an older transid than an existing record,
* the transid already in the tree is lowered.
- *
- * If an existing record is found the defrag item you pass in is freed.
*/
-static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
+static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
+ struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct inode_defrag *entry;
@@ -84,7 +81,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- ret = __compare_inode_defrag(defrag, entry);
+ ret = compare_inode_defrag(defrag, entry);
if (ret < 0)
p = &parent->rb_left;
else if (ret > 0)
@@ -108,7 +105,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
return 0;
}
-static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
+static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
return 0;
@@ -120,35 +117,29 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
}
/*
- * Insert a defrag record for this inode if auto defrag is enabled.
+ * Insert a defrag record for this inode if auto defrag is enabled. No errors
+ * returned as they're not considered fatal.
*/
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh)
+void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode_defrag *defrag;
- u64 transid;
int ret;
- if (!__need_auto_defrag(fs_info))
- return 0;
+ if (!need_auto_defrag(fs_info))
+ return;
if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
- return 0;
-
- if (trans)
- transid = trans->transid;
- else
- transid = inode->root->last_trans;
+ return;
defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
- return -ENOMEM;
+ return;
defrag->ino = btrfs_ino(inode);
- defrag->transid = transid;
- defrag->root = root->root_key.objectid;
+ defrag->transid = btrfs_get_root_last_trans(root);
+ defrag->root = btrfs_root_id(root);
defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
@@ -158,14 +149,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
* and then re-read this inode, this new inode doesn't have
* IN_DEFRAG flag. At the case, we may find the existed defrag.
*/
- ret = __btrfs_add_inode_defrag(inode, defrag);
+ ret = btrfs_insert_inode_defrag(inode, defrag);
if (ret)
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
} else {
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}
spin_unlock(&fs_info->defrag_inodes_lock);
- return 0;
}
/*
@@ -190,7 +180,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- ret = __compare_inode_defrag(&tmp, entry);
+ ret = compare_inode_defrag(&tmp, entry);
if (ret < 0)
p = parent->rb_left;
else if (ret > 0)
@@ -199,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
goto out;
}
- if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ if (parent && compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
if (parent)
entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -215,27 +205,24 @@ out:
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
- struct inode_defrag *defrag;
- struct rb_node *node;
+ struct inode_defrag *defrag, *next;
spin_lock(&fs_info->defrag_inodes_lock);
- node = rb_first(&fs_info->defrag_inodes);
- while (node) {
- rb_erase(node, &fs_info->defrag_inodes);
- defrag = rb_entry(node, struct inode_defrag, rb_node);
+
+ rbtree_postorder_for_each_entry_safe(defrag, next,
+ &fs_info->defrag_inodes, rb_node)
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- cond_resched_lock(&fs_info->defrag_inodes_lock);
+ fs_info->defrag_inodes = RB_ROOT;
- node = rb_first(&fs_info->defrag_inodes);
- }
spin_unlock(&fs_info->defrag_inodes_lock);
}
#define BTRFS_DEFRAG_BATCH 1024
-static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
- struct inode_defrag *defrag)
+static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag,
+ struct file_ra_state *ra)
{
struct btrfs_root *inode_root;
struct inode *inode;
@@ -246,7 +233,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
again:
if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
goto cleanup;
- if (!__need_auto_defrag(fs_info))
+ if (!need_auto_defrag(fs_info))
goto cleanup;
/* Get the inode */
@@ -256,7 +243,7 @@ again:
goto cleanup;
}
- inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ inode = btrfs_iget(defrag->ino, inode_root);
btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -274,9 +261,10 @@ again:
range.len = (u64)-1;
range.start = cur;
range.extent_thresh = defrag->extent_thresh;
+ file_ra_state_init(ra, inode->i_mapping);
sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
iput(inode);
@@ -303,11 +291,13 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
atomic_inc(&fs_info->defrag_running);
while (1) {
+ struct file_ra_state ra = { 0 };
+
/* Pause the auto defragger. */
if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
break;
- if (!__need_auto_defrag(fs_info))
+ if (!need_auto_defrag(fs_info))
break;
/* find an inode to defrag */
@@ -325,7 +315,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
first_ino = defrag->ino + 1;
root_objectid = defrag->root;
- __btrfs_run_defrag_inode(fs_info, defrag);
+ btrfs_run_defrag_inode(fs_info, defrag, &ra);
}
atomic_dec(&fs_info->defrag_running);
@@ -338,13 +328,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
}
/*
+ * Check if two blocks addresses are close, used by defrag.
+ */
+static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+ if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
+ return true;
+ if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
+ return true;
+ return false;
+}
+
+/*
+ * Go through all the leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order.
+ */
+static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *parent,
+ int start_slot, u64 *last_ret,
+ struct btrfs_key *progress)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ const u32 blocksize = fs_info->nodesize;
+ const int end_slot = btrfs_header_nritems(parent) - 1;
+ u64 search_start = *last_ret;
+ u64 last_block = 0;
+ int ret = 0;
+ bool progress_passed = false;
+
+ /*
+ * COWing must happen through a running transaction, which always
+ * matches the current fs generation (it's a transaction with a state
+ * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
+ * into error state to prevent the commit of any transaction.
+ */
+ if (unlikely(trans->transaction != fs_info->running_transaction ||
+ trans->transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
+ parent->start, btrfs_root_id(root), trans->transid,
+ fs_info->running_transaction->transid,
+ fs_info->generation);
+ return -EUCLEAN;
+ }
+
+ if (btrfs_header_nritems(parent) <= 1)
+ return 0;
+
+ for (int i = start_slot; i <= end_slot; i++) {
+ struct extent_buffer *cur;
+ struct btrfs_disk_key disk_key;
+ u64 blocknr;
+ u64 other;
+ bool close = true;
+
+ btrfs_node_key(parent, &disk_key, i);
+ if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
+ continue;
+
+ progress_passed = true;
+ blocknr = btrfs_node_blockptr(parent, i);
+ if (last_block == 0)
+ last_block = blocknr;
+
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent, i - 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (!close && i < end_slot) {
+ other = btrfs_node_blockptr(parent, i + 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (close) {
+ last_block = blocknr;
+ continue;
+ }
+
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
+ if (search_start == 0)
+ search_start = last_block;
+
+ btrfs_tree_lock(cur);
+ ret = btrfs_force_cow_block(trans, root, cur, parent, i,
+ &cur, search_start,
+ min(16 * blocksize,
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ break;
+ }
+ search_start = cur->start;
+ last_block = cur->start;
+ *last_ret = search_start;
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ }
+ return ret;
+}
+
+/*
* Defrag all the leaves in a given btree.
* Read all the leaves and try to get key order to
* better reflect disk order
*/
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -461,6 +556,45 @@ done:
}
/*
+ * Defrag a given btree. Every leaf in the btree is read and defragmented.
+ */
+int btrfs_defrag_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
+ return 0;
+
+ while (1) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ ret = btrfs_defrag_leaves(trans, root);
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+ cond_resched();
+
+ if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
+ break;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ btrfs_debug(fs_info, "defrag_root cancelled");
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
+ return ret;
+}
+
+/*
* Defrag specific helper to get an extent map.
*
* Differences between this and btrfs_get_extent() are:
@@ -564,8 +698,10 @@ iterate:
*/
if (key.offset > start) {
em->start = start;
- em->orig_start = start;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = 0;
+ em->offset = 0;
em->len = key.offset - start;
break;
}
@@ -627,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* We can get a merged extent, in that case, we need to re-search
* tree to get the original em for defrag.
*
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
+ * This is because even if we have adjacent extents that are contiguous
+ * and compatible (same type and flags), we still want to defrag them
+ * so that we use less metadata (extent items in the extent tree and
+ * file extent items in the inode's subvolume tree).
*/
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
- newer_than && em->generation >= newer_than) {
+ if (em && (em->flags & EXTENT_FLAG_MERGED)) {
free_extent_map(em);
em = NULL;
}
@@ -658,7 +794,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ if (extent_map_is_compressed(em))
return BTRFS_MAX_COMPRESSED;
return fs_info->max_extent_size;
}
@@ -666,7 +802,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
u32 extent_thresh, u64 newer_than, bool locked)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *next;
bool ret = false;
@@ -682,9 +818,9 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
*/
next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
/* No more em or hole */
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ if (next->flags & EXTENT_FLAG_PREALLOC)
goto out;
/*
* If the next extent is at its max capacity, defragging current extent
@@ -717,20 +853,21 @@ out:
* NOTE: Caller should also wait for page writeback after the cluster is
* prepared, here we don't do writeback wait for each page.
*/
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
+static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
struct address_space *mapping = inode->vfs_inode.i_mapping;
gfp_t mask = btrfs_alloc_write_mask(mapping);
u64 page_start = (u64)index << PAGE_SHIFT;
u64 page_end = page_start + PAGE_SIZE - 1;
struct extent_state *cached_state = NULL;
- struct page *page;
+ struct folio *folio;
int ret;
again:
- page = find_or_create_page(mapping, index, mask);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio))
+ return folio;
/*
* Since we can defragment files opened read-only, we can encounter
@@ -740,16 +877,16 @@ again:
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
- if (PageCompound(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio_test_large(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-ETXTBSY);
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(ret);
}
@@ -764,17 +901,17 @@ again:
if (!ordered)
break;
- unlock_page(page);
+ folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
- lock_page(page);
+ folio_lock(folio);
/*
- * We unlocked the page above, so we need check if it was
+ * We unlocked the folio above, so we need check if it was
* released or not.
*/
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio->mapping != mapping || !folio->private) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
}
@@ -783,21 +920,21 @@ again:
* Now the page range has no ordered extent any more. Read the page to
* make it uptodate.
*/
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping || !folio->private) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-EIO);
}
}
- return page;
+ return folio;
}
struct defrag_target_range {
@@ -848,14 +985,13 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* This is for users who want to convert inline extents to
* regular ones through max_inline= mount option.
*/
- if (em->block_start == EXTENT_MAP_INLINE &&
+ if (em->disk_bytenr == EXTENT_MAP_INLINE &&
em->len <= inode->root->fs_info->max_inline)
goto next;
- /* Skip hole/delalloc/preallocated extents */
- if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ /* Skip holes and preallocated extents. */
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
+ (em->flags & EXTENT_FLAG_PREALLOC))
goto next;
/* Skip older extent */
@@ -891,8 +1027,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* very likely resulting in a larger extent after writeback is
* triggered (except in a case of free space fragmentation).
*/
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
+ if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC))
goto next;
/*
@@ -919,7 +1055,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* So if an inline extent passed all above checks, just add it
* for defrag, and be converted to regular extents.
*/
- if (em->block_start == EXTENT_MAP_INLINE)
+ if (em->disk_bytenr == EXTENT_MAP_INLINE)
goto add;
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
@@ -1019,7 +1155,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
*/
static int defrag_one_locked_target(struct btrfs_inode *inode,
struct defrag_target_range *target,
- struct page **pages, int nr_pages,
+ struct folio **folios, int nr_pages,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1028,7 +1164,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
const u64 len = target->len;
unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = page_index(pages[0]);
+ unsigned long first_index = folios[0]->index;
int ret = 0;
int i;
@@ -1045,8 +1181,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
/* Update the page status */
for (i = start_index - first_index; i <= last_index - first_index; i++) {
- ClearPageChecked(pages[i]);
- btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ folio_clear_checked(folios[i]);
+ btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
@@ -1062,7 +1198,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
struct defrag_target_range *entry;
struct defrag_target_range *tmp;
LIST_HEAD(target_list);
- struct page **pages;
+ struct folio **folios;
const u32 sectorsize = inode->root->fs_info->sectorsize;
u64 last_index = (start + len - 1) >> PAGE_SHIFT;
u64 start_index = start >> PAGE_SHIFT;
@@ -1073,21 +1209,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages)
+ folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
+ if (!folios)
return -ENOMEM;
/* Prepare all pages */
for (i = 0; i < nr_pages; i++) {
- pages[i] = defrag_prepare_one_page(inode, start_index + i);
- if (IS_ERR(pages[i])) {
- ret = PTR_ERR(pages[i]);
- pages[i] = NULL;
- goto free_pages;
+ folios[i] = defrag_prepare_one_folio(inode, start_index + i);
+ if (IS_ERR(folios[i])) {
+ ret = PTR_ERR(folios[i]);
+ nr_pages = i;
+ goto free_folios;
}
}
for (i = 0; i < nr_pages; i++)
- wait_on_page_writeback(pages[i]);
+ folio_wait_writeback(folios[i]);
/* Lock the pages range */
lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
@@ -1107,7 +1243,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
goto unlock_extent;
list_for_each_entry(entry, &target_list, list) {
- ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
&cached_state);
if (ret < 0)
break;
@@ -1121,14 +1257,12 @@ unlock_extent:
unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
(last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
&cached_state);
-free_pages:
+free_folios:
for (i = 0; i < nr_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ folio_unlock(folios[i]);
+ folio_put(folios[i]);
}
- kfree(pages);
+ kfree(folios);
return ret;
}
@@ -1174,8 +1308,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
if (entry->start + range_len <= *last_scanned_ret)
continue;
- if (ra)
- page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
ra, NULL, entry->start >> PAGE_SHIFT,
((entry->start + range_len - 1) >> PAGE_SHIFT) -
(entry->start >> PAGE_SHIFT) + 1);
@@ -1207,7 +1340,7 @@ out:
* Entry point to file defragmentation.
*
* @inode: inode to be defragged
- * @ra: readahead state (can be NUL)
+ * @ra: readahead state
* @range: defrag options including range and flags
* @newer_than: minimum transid to defrag
* @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
@@ -1223,18 +1356,19 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
- bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
int ret = 0;
u32 extent_thresh = range->extent_thresh;
pgoff_t start_index;
+ ASSERT(ra);
+
if (isize == 0)
return 0;
@@ -1264,18 +1398,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
/*
- * If we were not given a ra, allocate a readahead context. As
- * readahead is just an optimization, defrag will work without it so
- * we don't error out.
- */
- if (!ra) {
- ra_allocated = true;
- ra = kzalloc(sizeof(*ra), GFP_KERNEL);
- if (ra)
- file_ra_state_init(ra, inode->i_mapping);
- }
-
- /*
* Make writeback start from the beginning of the range, so that the
* defrag range can be written sequentially.
*/
@@ -1329,8 +1451,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
cond_resched();
}
- if (ra_allocated)
- kfree(ra);
/*
* Update range.start for autodefrag, this will indicate where to start
* in next run.
@@ -1369,9 +1489,7 @@ void __cold btrfs_auto_defrag_exit(void)
int __init btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
- sizeof(struct inode_defrag), 0,
- SLAB_MEM_SPREAD,
- NULL);
+ sizeof(struct inode_defrag), 0, 0, NULL);
if (!btrfs_inode_defrag_cachep)
return -ENOMEM;
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 5305f2283b5e..6b7596c4f0dc 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -3,16 +3,25 @@
#ifndef BTRFS_DEFRAG_H
#define BTRFS_DEFRAG_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
+
+struct inode;
+struct file_ra_state;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_trans_handle;
+struct btrfs_ioctl_defrag_range_args;
+
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag);
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh);
+void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
{
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 4a7aefa5f9cf..7aa8a395d838 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -6,9 +6,7 @@
#include "block-rsv.h"
#include "btrfs_inode.h"
#include "space-info.h"
-#include "transaction.h"
#include "qgroup.h"
-#include "block-group.h"
#include "fs.h"
/*
@@ -113,7 +111,7 @@
* making error handling and cleanup easier.
*/
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
+int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -350,7 +348,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
noflush);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index c5d573f2366e..3f32953c0a80 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -3,9 +3,13 @@
#ifndef BTRFS_DELALLOC_SPACE_H
#define BTRFS_DELALLOC_SPACE_H
+#include <linux/types.h>
+
struct extent_changeset;
+struct btrfs_inode;
+struct btrfs_fs_info;
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len,
bool noflush);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 32c5f5a8a0e9..508bdbae29a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -28,11 +28,7 @@ static struct kmem_cache *delayed_node_cache;
int __init btrfs_delayed_inode_init(void)
{
- delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
- sizeof(struct btrfs_delayed_node),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
+ delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0);
if (!delayed_node_cache)
return -ENOMEM;
return 0;
@@ -43,6 +39,17 @@ void __cold btrfs_delayed_inode_exit(void)
kmem_cache_destroy(delayed_node_cache);
}
+void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root)
+{
+ atomic_set(&delayed_root->items, 0);
+ atomic_set(&delayed_root->items_seq, 0);
+ delayed_root->nodes = 0;
+ spin_lock_init(&delayed_root->lock);
+ init_waitqueue_head(&delayed_root->wait);
+ INIT_LIST_HEAD(&delayed_root->node_list);
+ INIT_LIST_HEAD(&delayed_root->prepare_list);
+}
+
static inline void btrfs_init_delayed_node(
struct btrfs_delayed_node *delayed_node,
struct btrfs_root *root, u64 inode_id)
@@ -70,22 +77,22 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
return node;
}
- spin_lock(&root->inode_lock);
- node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ xa_lock(&root->delayed_nodes);
+ node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
BUG_ON(btrfs_inode->delayed_node != node);
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return node;
}
/*
* It's possible that we're racing into the middle of removing
- * this node from the radix tree. In this case, the refcount
+ * this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the radix at all; our release
+ * NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -93,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the radix, we want to bump the
+ * If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -104,10 +111,10 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = NULL;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return node;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return NULL;
}
@@ -120,6 +127,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
int ret;
+ void *ptr;
again:
node = btrfs_get_delayed_node(btrfs_inode);
@@ -131,26 +139,30 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* cached in the btrfs inode and can be accessed */
+ /* Cached in the inode and can be accessed. */
refcount_set(&node->refs, 2);
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
+ /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
+ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+ if (ret == -ENOMEM) {
kmem_cache_free(delayed_node_cache, node);
- return ERR_PTR(ret);
+ return ERR_PTR(-ENOMEM);
}
-
- spin_lock(&root->inode_lock);
- ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
- if (ret == -EEXIST) {
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->delayed_nodes);
+ ptr = xa_load(&root->delayed_nodes, ino);
+ if (ptr) {
+ /* Somebody inserted it, go back and read it. */
+ xa_unlock(&root->delayed_nodes);
kmem_cache_free(delayed_node_cache, node);
- radix_tree_preload_end();
+ node = NULL;
goto again;
}
+ ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+ ASSERT(xa_err(ptr) != -EINVAL);
+ ASSERT(xa_err(ptr) != -ENOMEM);
+ ASSERT(ptr == NULL);
btrfs_inode->delayed_node = node;
- spin_unlock(&root->inode_lock);
- radix_tree_preload_end();
+ xa_unlock(&root->delayed_nodes);
return node;
}
@@ -263,15 +275,12 @@ static void __btrfs_release_delayed_node(
if (refcount_dec_and_test(&delayed_node->refs)) {
struct btrfs_root *root = delayed_node->root;
- spin_lock(&root->inode_lock);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
/*
* Once our refcount goes to zero, nobody is allowed to bump it
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
- spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
}
@@ -328,7 +337,8 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
}
/*
- * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * Look up the delayed item by key.
+ *
* @delayed_node: pointer to the delayed node
* @index: the dir index value to lookup (offset of a dir index key)
*
@@ -515,7 +525,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
/*
* For insertions we track reserved metadata space by accounting
* for the number of leaves that will be used, based on the delayed
- * node's index_items_size field.
+ * node's curr_index_batch_size and index_item_leaves fields.
*/
if (item->type == BTRFS_DELAYED_DELETION_ITEM)
item->bytes_reserved = num_bytes;
@@ -1033,14 +1043,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(leaf))
- goto search;
-again:
+ /*
+ * Now we're going to delete the INODE_REF/EXTREF, which should be the
+ * only one ref left. Check if the next item is an INODE_REF/EXTREF.
+ *
+ * But if we're the last item already, release and search for the last
+ * INODE_REF/EXTREF.
+ */
+ if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
+ key.objectid = node->inode_id;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = (u64)-1;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret > 0);
+ ASSERT(path->slots[0] > 0);
+ ret = 0;
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ } else {
+ path->slots[0]++;
+ }
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != node->inode_id)
goto out;
-
if (key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)
goto out;
@@ -1067,22 +1096,6 @@ err_out:
btrfs_abort_transaction(trans, ret);
return ret;
-
-search:
- btrfs_release_path(path);
-
- key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = -1;
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto err_out;
- ASSERT(ret);
-
- ret = 0;
- leaf = path->nodes[0];
- path->slots[0]--;
- goto again;
}
static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -1379,8 +1392,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
- NULL);
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL);
async_work->nr = nr;
btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
@@ -1457,7 +1469,7 @@ static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 flags,
+ const struct btrfs_disk_key *disk_key, u8 flags,
u64 index)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1637,7 +1649,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, node->root->root_key.objectid,
+ index, btrfs_root_id(node->root),
node->inode_id, ret);
btrfs_delayed_item_release_metadata(dir->root, item);
btrfs_release_delayed_item(item);
@@ -1670,7 +1682,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
return 0;
}
-bool btrfs_readdir_get_delayed_items(struct inode *inode,
+bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
u64 last_index,
struct list_head *ins_list,
struct list_head *del_list)
@@ -1678,7 +1690,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *item;
- delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
return false;
@@ -1686,8 +1698,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
- btrfs_inode_lock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1718,7 +1730,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
return true;
}
-void btrfs_readdir_put_delayed_items(struct inode *inode,
+void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list)
{
@@ -1740,10 +1752,10 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
* The VFS is going to do up_read(), so we need to downgrade back to a
* read lock.
*/
- downgrade_write(&inode->i_rwsem);
+ downgrade_write(&inode->vfs_inode.i_rwsem);
}
-int btrfs_should_delete_dir_index(struct list_head *del_list,
+int btrfs_should_delete_dir_index(const struct list_head *del_list,
u64 index)
{
struct btrfs_delayed_item *curr;
@@ -1761,11 +1773,10 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
}
/*
- * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
- *
+ * Read dir info stored in the delayed tree.
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list)
+ const struct list_head *ins_list)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
@@ -1835,24 +1846,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_stack_timespec_sec(&inode_item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
- btrfs_set_stack_timespec_sec(&inode_item->otime,
- BTRFS_I(inode)->i_otime.tv_sec);
- btrfs_set_stack_timespec_nsec(&inode_item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec);
+ btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
}
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
@@ -1892,22 +1901,21 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
- inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
- inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+ inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+ btrfs_stack_timespec_nsec(&inode_item->atime));
- inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
- inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
+ inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+ btrfs_stack_timespec_nsec(&inode_item->mtime));
inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
btrfs_stack_timespec_nsec(&inode_item->ctime));
- BTRFS_I(inode)->i_otime.tv_sec =
- btrfs_stack_timespec_sec(&inode_item->otime);
- BTRFS_I(inode)->i_otime.tv_nsec =
- btrfs_stack_timespec_nsec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
@@ -1915,9 +1923,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
}
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_delayed_node *delayed_node;
int ret = 0;
@@ -2041,34 +2049,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- u64 inode_id = 0;
+ unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
- int i, n;
while (1) {
- spin_lock(&root->inode_lock);
- n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
- (void **)delayed_nodes, inode_id,
- ARRAY_SIZE(delayed_nodes));
- if (!n) {
- spin_unlock(&root->inode_lock);
- break;
+ struct btrfs_delayed_node *node;
+ int count;
+
+ xa_lock(&root->delayed_nodes);
+ if (xa_empty(&root->delayed_nodes)) {
+ xa_unlock(&root->delayed_nodes);
+ return;
}
- inode_id = delayed_nodes[n - 1]->inode_id + 1;
- for (i = 0; i < n; i++) {
+ count = 0;
+ xa_for_each_start(&root->delayed_nodes, index, node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
- delayed_nodes[i] = NULL;
+ if (refcount_inc_not_zero(&node->refs)) {
+ delayed_nodes[count] = node;
+ count++;
+ }
+ if (count >= ARRAY_SIZE(delayed_nodes))
+ break;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
+ index++;
- for (i = 0; i < n; i++) {
- if (!delayed_nodes[i])
- continue;
+ for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1da213197f55..7cfefdfe54ea 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -7,15 +7,23 @@
#ifndef BTRFS_DELAYED_INODE_H
#define BTRFS_DELAYED_INODE_H
+#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/wait.h>
+#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include "ctree.h"
+struct btrfs_disk_key;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_trans_handle;
+
enum btrfs_delayed_item_type {
BTRFS_DELAYED_INSERTION_ITEM,
BTRFS_DELAYED_DELETION_ITEM
@@ -98,22 +106,11 @@ struct btrfs_delayed_item {
char data[] __counted_by(data_len);
};
-static inline void btrfs_init_delayed_root(
- struct btrfs_delayed_root *delayed_root)
-{
- atomic_set(&delayed_root->items, 0);
- atomic_set(&delayed_root->items_seq, 0);
- delayed_root->nodes = 0;
- spin_lock_init(&delayed_root->lock);
- init_waitqueue_head(&delayed_root->wait);
- INIT_LIST_HEAD(&delayed_root->node_list);
- INIT_LIST_HEAD(&delayed_root->prepare_list);
-}
-
+void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root);
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 flags,
+ const struct btrfs_disk_key *disk_key, u8 flags,
u64 index);
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
@@ -135,7 +132,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode);
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
@@ -147,17 +143,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
/* Used for readdir() */
-bool btrfs_readdir_get_delayed_items(struct inode *inode,
+bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
u64 last_index,
struct list_head *ins_list,
struct list_head *del_list);
-void btrfs_readdir_put_delayed_items(struct inode *inode,
+void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
-int btrfs_should_delete_dir_index(struct list_head *del_list,
+int btrfs_should_delete_dir_index(const struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list);
+ const struct list_head *ins_list);
/* Used during directory logging. */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6f2e48d697dd..cab94d141f66 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -16,8 +16,7 @@
#include "fs.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
-struct kmem_cache *btrfs_delayed_tree_ref_cachep;
-struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_ref_node_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
* delayed back reference update tracking. For subvolume trees
@@ -57,16 +56,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
* Release a ref head's reservation.
*
* @fs_info: the filesystem
- * @nr: number of items to drop
+ * @nr_refs: number of delayed refs to drop
+ * @nr_csums: number of csum items to drop
*
* Drops the delayed ref head's count from the delayed refs rsv and free any
* excess reservation we had.
*/
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
- const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
- u64 released = 0;
+ u64 num_bytes;
+ u64 released;
+
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
+ num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
@@ -77,68 +80,118 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
/*
* Adjust the size of the delayed refs rsv.
*
- * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
- * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates
+ * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and
+ * add it to the delayed_refs_rsv.
*/
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv;
u64 num_bytes;
+ u64 reserved_bytes;
- if (!trans->delayed_ref_updates)
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
+ num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
+ trans->delayed_ref_csum_deletions);
+
+ if (num_bytes == 0)
return;
- num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- trans->delayed_ref_updates);
+ /*
+ * Try to take num_bytes from the transaction's local delayed reserve.
+ * If not possible, try to take as much as it's available. If the local
+ * reserve doesn't have enough reserved space, the delayed refs reserve
+ * will be refilled next time btrfs_delayed_refs_rsv_refill() is called
+ * by someone or if a transaction commit is triggered before that, the
+ * global block reserve will be used. We want to minimize using the
+ * global block reserve for cases we can account for in advance, to
+ * avoid exhausting it and reach -ENOSPC during a transaction commit.
+ */
+ spin_lock(&local_rsv->lock);
+ reserved_bytes = min(num_bytes, local_rsv->reserved);
+ local_rsv->reserved -= reserved_bytes;
+ local_rsv->full = (local_rsv->reserved >= local_rsv->size);
+ spin_unlock(&local_rsv->lock);
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
- delayed_rsv->full = false;
+ delayed_rsv->reserved += reserved_bytes;
+ delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size);
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
+ trans->delayed_ref_csum_deletions = 0;
}
/*
- * Transfer bytes to our delayed refs rsv.
- *
- * @fs_info: the filesystem
- * @num_bytes: number of bytes to transfer
- *
- * This transfers up to the num_bytes amount, previously reserved, to the
- * delayed_refs_rsv. Any extra bytes are returned to the space info.
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * insertion, used after allocating a block group.
*/
-void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- u64 num_bytes)
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- u64 to_free = 0;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
- u64 delta = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- if (num_bytes > delta) {
- to_free = num_bytes - delta;
- num_bytes = delta;
- }
- } else {
- to_free = num_bytes;
- num_bytes = 0;
- }
+ spin_lock(&delayed_rsv->lock);
+ /*
+ * Inserting a block group item does not require changing the free space
+ * tree, only the extent tree or the block group tree, so this is all we
+ * need.
+ */
+ delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+}
- if (num_bytes)
- delayed_refs_rsv->reserved += num_bytes;
- if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
- delayed_refs_rsv->full = true;
- spin_unlock(&delayed_refs_rsv->lock);
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item insertion.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 released;
+
+ released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * update.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+ spin_lock(&delayed_rsv->lock);
+ /*
+ * Updating a block group item does not result in new nodes/leaves and
+ * does not require changing the free space tree, only the extent tree
+ * or the block group tree, so this is all we need.
+ */
+ delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+}
- if (num_bytes)
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item update.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
+ u64 released;
+
+ released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+ if (released > 0)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
- 0, num_bytes, 1);
- if (to_free)
- btrfs_space_info_free_bytes_may_use(fs_info,
- delayed_refs_rsv->space_info, to_free);
+ 0, released, 0);
}
/*
@@ -154,6 +207,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *space_info = block_rsv->space_info;
u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
u64 num_bytes = 0;
u64 refilled_bytes;
@@ -170,7 +224,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
if (ret)
return ret;
@@ -199,8 +253,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
if (to_free > 0)
- btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info,
- to_free);
+ btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
if (refilled_bytes > 0)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
@@ -209,50 +262,19 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
}
/*
- * compare two delayed tree backrefs with same bytenr and type
- */
-static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
- struct btrfs_delayed_tree_ref *ref2)
-{
- if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- } else {
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
- }
- return 0;
-}
-
-/*
* compare two delayed data backrefs with same bytenr and type
*/
-static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
- struct btrfs_delayed_data_ref *ref2)
+static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
+ struct btrfs_delayed_ref_node *ref2)
{
- if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- if (ref1->objectid < ref2->objectid)
- return -1;
- if (ref1->objectid > ref2->objectid)
- return 1;
- if (ref1->offset < ref2->offset)
- return -1;
- if (ref1->offset > ref2->offset)
- return 1;
- } else {
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
- }
+ if (ref1->data_ref.objectid < ref2->data_ref.objectid)
+ return -1;
+ if (ref1->data_ref.objectid > ref2->data_ref.objectid)
+ return 1;
+ if (ref1->data_ref.offset < ref2->data_ref.offset)
+ return -1;
+ if (ref1->data_ref.offset > ref2->data_ref.offset)
+ return 1;
return 0;
}
@@ -266,13 +288,20 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
return -1;
if (ref1->type > ref2->type)
return 1;
- if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
- ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
- btrfs_delayed_node_to_tree_ref(ref2));
- else
- ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
- btrfs_delayed_node_to_data_ref(ref2));
+ if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY ||
+ ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ } else {
+ if (ref1->ref_root < ref2->ref_root)
+ return -1;
+ if (ref1->ref_root > ref2->ref_root)
+ return 1;
+ if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
+ ret = comp_data_refs(ref1, ref2);
+ }
if (ret)
return ret;
if (check_seq) {
@@ -422,7 +451,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
return 0;
}
-static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
@@ -433,9 +463,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
-static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static bool merge_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref,
u64 seq)
@@ -464,10 +496,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
mod = -next->ref_mod;
}
- drop_delayed_ref(delayed_refs, head, next);
+ drop_delayed_ref(fs_info, delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
- drop_delayed_ref(delayed_refs, head, ref);
+ drop_delayed_ref(fs_info, delayed_refs, head, ref);
done = true;
} else {
/*
@@ -505,7 +537,7 @@ again:
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
- if (merge_ref(delayed_refs, head, ref, seq))
+ if (merge_ref(fs_info, delayed_refs, head, ref, seq))
goto again;
}
}
@@ -584,10 +616,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
* Return true if the ref was merged into an existing one (and therefore can be
* freed by the caller).
*/
-static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
+static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
+ struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
struct btrfs_delayed_ref_node *exist;
int mod;
@@ -598,6 +631,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
+ trans->delayed_ref_updates++;
return false;
}
@@ -626,7 +660,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
/* remove existing tail if its ref_mod is zero */
if (exist->ref_mod == 0)
- drop_delayed_ref(root, href, exist);
+ drop_delayed_ref(trans->fs_info, root, href, exist);
spin_unlock(&href->lock);
return true;
}
@@ -647,6 +681,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
BUG_ON(existing->is_data != update->is_data);
spin_lock(&existing->lock);
+
+ /*
+ * When freeing an extent, we may not know the owning root when we
+ * first create the head_ref. However, some deref before the last deref
+ * will know it, so we just need to update the head_ref accordingly.
+ */
+ if (!existing->owning_root)
+ existing->owning_root = update->owning_root;
+
if (update->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -656,6 +699,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
* Set it again here
*/
existing->must_insert_reserved = update->must_insert_reserved;
+ existing->owning_root = update->owning_root;
/*
* update the num_bytes so we make sure the accounting
@@ -695,6 +739,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
/*
* If we are going to from a positive ref mod to a negative or vice
* versa we need to make sure to adjust pending_csums accordingly.
+ * We reserve bytes for csum deletion when adding or updating a ref head
+ * see add_delayed_ref_head() for more details.
*/
if (existing->is_data) {
u64 csum_leaves =
@@ -703,11 +749,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
delayed_refs->pending_csums -= existing->num_bytes;
- btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+ btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
}
if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
delayed_refs->pending_csums += existing->num_bytes;
- trans->delayed_ref_updates += csum_leaves;
+ trans->delayed_ref_csum_deletions += csum_leaves;
}
}
@@ -715,18 +761,20 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
}
static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_ref *generic_ref,
struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, u64 ref_root,
- u64 reserved, int action, bool is_data,
- bool is_system)
+ u64 reserved)
{
int count_mod = 1;
bool must_insert_reserved = false;
/* If reserved is provided, it must be a data extent. */
- BUG_ON(!is_data && reserved);
+ BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved);
- switch (action) {
+ switch (generic_ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ /* count_mod is already set to 1. */
+ break;
case BTRFS_UPDATE_DELAYED_HEAD:
count_mod = 0;
break;
@@ -755,12 +803,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
}
refcount_set(&head_ref->refs, 1);
- head_ref->bytenr = bytenr;
- head_ref->num_bytes = num_bytes;
+ head_ref->bytenr = generic_ref->bytenr;
+ head_ref->num_bytes = generic_ref->num_bytes;
head_ref->ref_mod = count_mod;
+ head_ref->reserved_bytes = reserved;
head_ref->must_insert_reserved = must_insert_reserved;
- head_ref->is_data = is_data;
- head_ref->is_system = is_system;
+ head_ref->owning_root = generic_ref->owning_root;
+ head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA);
+ head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
@@ -769,13 +819,19 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
+ /* If not metadata set an impossible level to help debugging. */
+ if (generic_ref->type == BTRFS_REF_METADATA)
+ head_ref->level = generic_ref->tree_ref.level;
+ else
+ head_ref->level = U8_MAX;
+
if (qrecord) {
- if (ref_root && reserved) {
+ if (generic_ref->ref_root && reserved) {
qrecord->data_rsv = reserved;
- qrecord->data_rsv_refroot = ref_root;
+ qrecord->data_rsv_refroot = generic_ref->ref_root;
}
- qrecord->bytenr = bytenr;
- qrecord->num_bytes = num_bytes;
+ qrecord->bytenr = generic_ref->bytenr;
+ qrecord->num_bytes = generic_ref->num_bytes;
qrecord->old_roots = NULL;
}
}
@@ -784,6 +840,8 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
* helper function to actually insert a head node into the rbtree.
* this does all the dirty work in terms of maintaining the correct
* overall modification count.
+ *
+ * Returns an error pointer in case of an error.
*/
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
@@ -791,6 +849,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord,
int action, bool *qrecord_inserted_ret)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
bool qrecord_inserted = false;
@@ -799,14 +858,23 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
/* Record qgroup extent info if provided */
if (qrecord) {
- if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
- delayed_refs, qrecord))
+ int ret;
+
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord);
+ if (ret) {
+ /* Clean up if insertion fails or item exists. */
+ xa_release(&delayed_refs->dirty_extents,
+ qrecord->bytenr >> fs_info->sectorsize_bits);
+ /* Caller responsible for freeing qrecord on error. */
+ if (ret < 0)
+ return ERR_PTR(ret);
kfree(qrecord);
- else
+ } else {
qrecord_inserted = true;
+ }
}
- trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
+ trace_add_delayed_ref_head(fs_info, head_ref, action);
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
@@ -819,16 +887,20 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ /*
+ * We reserve the amount of bytes needed to delete csums when
+ * adding the ref head and not when adding individual drop refs
+ * since the csum items are deleted only after running the last
+ * delayed drop ref (the data extent's ref count drops to 0).
+ */
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
- trans->delayed_ref_updates +=
- btrfs_csum_bytes_to_leaves(trans->fs_info,
- head_ref->num_bytes);
+ trans->delayed_ref_csum_deletions +=
+ btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes);
}
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -837,8 +909,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
/*
- * init_delayed_ref_common - Initialize the structure which represents a
- * modification to a an extent.
+ * Initialize the structure which represents a modification to a an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
@@ -863,90 +934,111 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
*/
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes, u64 ref_root,
- int action, u8 ref_type)
+ struct btrfs_ref *generic_ref)
{
+ int action = generic_ref->action;
u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- if (is_fstree(ref_root))
+ if (is_fstree(generic_ref->ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
refcount_set(&ref->refs, 1);
- ref->bytenr = bytenr;
- ref->num_bytes = num_bytes;
+ ref->bytenr = generic_ref->bytenr;
+ ref->num_bytes = generic_ref->num_bytes;
ref->ref_mod = 1;
ref->action = action;
ref->seq = seq;
- ref->type = ref_type;
+ ref->type = btrfs_ref_type(generic_ref);
+ ref->ref_root = generic_ref->ref_root;
+ ref->parent = generic_ref->parent;
RB_CLEAR_NODE(&ref->ref_node);
INIT_LIST_HEAD(&ref->add_list);
+
+ if (generic_ref->type == BTRFS_REF_DATA)
+ ref->data_ref = generic_ref->data_ref;
+ else
+ ref->tree_ref = generic_ref->tree_ref;
}
-/*
- * add a delayed tree ref. This does all of the accounting required
- * to make sure the delayed ref is eventually processed before this
- * transaction commits.
- */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
- struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op)
+void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
+ bool skip_qgroup)
+{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @real_root not set, use @root as fallback */
+ generic_ref->real_root = mod_root ?: generic_ref->ref_root;
+#endif
+ generic_ref->tree_ref.level = level;
+ generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+
+}
+
+void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
+ u64 mod_root, bool skip_qgroup)
+{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @real_root not set, use @root as fallback */
+ generic_ref->real_root = mod_root ?: generic_ref->ref_root;
+#endif
+ generic_ref->data_ref.objectid = ino;
+ generic_ref->data_ref.offset = offset;
+ generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+}
+
+static int add_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ struct btrfs_delayed_extent_op *extent_op,
+ u64 reserved)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_head *new_head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
bool qrecord_inserted;
- bool is_system;
- bool merged;
int action = generic_ref->action;
- int level = generic_ref->tree_ref.level;
- u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
- u64 parent = generic_ref->parent;
- u8 ref_type;
-
- is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
+ bool merged;
+ int ret;
- ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
- ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
- if (!ref)
+ node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS);
+ if (!node)
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto free_node;
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto free_head_ref;
+ }
+ if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
+ generic_ref->bytenr >> fs_info->sectorsize_bits,
+ GFP_NOFS)) {
+ ret = -ENOMEM;
+ goto free_record;
}
}
- if (parent)
- ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
- else
- ref_type = BTRFS_TREE_BLOCK_REF_KEY;
-
- init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.owning_root, action,
- ref_type);
- ref->root = generic_ref->tree_ref.owning_root;
- ref->parent = parent;
- ref->level = level;
-
- init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.owning_root, 0, action,
- false, is_system);
+ init_delayed_ref_common(fs_info, node, generic_ref);
+ init_delayed_ref_head(head_ref, generic_ref, record, reserved);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -956,10 +1048,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted);
+ new_head_ref = add_delayed_ref_head(trans, head_ref, record,
+ action, &qrecord_inserted);
+ if (IS_ERR(new_head_ref)) {
+ spin_unlock(&delayed_refs->lock);
+ ret = PTR_ERR(new_head_ref);
+ goto free_record;
+ }
+ head_ref = new_head_ref;
- merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, node);
spin_unlock(&delayed_refs->lock);
/*
@@ -968,16 +1066,36 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
*/
btrfs_update_delayed_refs_rsv(trans);
- trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
- action == BTRFS_ADD_DELAYED_EXTENT ?
- BTRFS_ADD_DELAYED_REF : action);
+ if (generic_ref->type == BTRFS_REF_DATA)
+ trace_add_delayed_data_ref(trans->fs_info, node);
+ else
+ trace_add_delayed_tree_ref(trans->fs_info, node);
if (merged)
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
if (qrecord_inserted)
- btrfs_qgroup_trace_extent_post(trans, record);
-
+ return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
+
+free_record:
+ kfree(record);
+free_head_ref:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+free_node:
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
+ return ret;
+}
+
+/*
+ * Add a delayed tree ref. This does all of the accounting required to make sure
+ * the delayed ref is eventually processed before this transaction commits.
+ */
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
+ return add_delayed_ref(trans, generic_ref, extent_op, 0);
}
/*
@@ -987,114 +1105,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
u64 reserved)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_data_ref *ref;
- struct btrfs_delayed_ref_head *head_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_qgroup_extent_record *record = NULL;
- bool qrecord_inserted;
- int action = generic_ref->action;
- bool merged;
- u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
- u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.owning_root;
- u64 owner = generic_ref->data_ref.ino;
- u64 offset = generic_ref->data_ref.offset;
- u8 ref_type;
-
- ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
- ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- if (parent)
- ref_type = BTRFS_SHARED_DATA_REF_KEY;
- else
- ref_type = BTRFS_EXTENT_DATA_REF_KEY;
- init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- ref_root, action, ref_type);
- ref->root = ref_root;
- ref->parent = parent;
- ref->objectid = owner;
- ref->offset = offset;
-
-
- head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
- if (!head_ref) {
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- return -ENOMEM;
- }
-
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- !generic_ref->skip_qgroup) {
- record = kzalloc(sizeof(*record), GFP_NOFS);
- if (!record) {
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- kmem_cache_free(btrfs_delayed_ref_head_cachep,
- head_ref);
- return -ENOMEM;
- }
- }
-
- init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false);
- head_ref->extent_op = NULL;
-
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
-
- /*
- * insert both the head node and the new ref without dropping
- * the spin lock
- */
- head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted);
-
- merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
- spin_unlock(&delayed_refs->lock);
-
- /*
- * Need to update the delayed_refs_rsv with any changes we may have
- * made.
- */
- btrfs_update_delayed_refs_rsv(trans);
-
- trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
- action == BTRFS_ADD_DELAYED_EXTENT ?
- BTRFS_ADD_DELAYED_REF : action);
- if (merged)
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
-
-
- if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(trans, record);
- return 0;
+ ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action);
+ return add_delayed_ref(trans, generic_ref, NULL, reserved);
}
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes,
+ u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_head *head_ref_ret;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_ref generic_ref = {
+ .type = BTRFS_REF_METADATA,
+ .action = BTRFS_UPDATE_DELAYED_HEAD,
+ .bytenr = bytenr,
+ .num_bytes = num_bytes,
+ .tree_ref.level = level,
+ };
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
return -ENOMEM;
- init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, false, false);
+ init_delayed_ref_head(head_ref, &generic_ref, NULL, 0);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
- NULL);
-
+ head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL,
+ BTRFS_UPDATE_DELAYED_HEAD, NULL);
spin_unlock(&delayed_refs->lock);
+ if (IS_ERR(head_ref_ret)) {
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ return PTR_ERR(head_ref_ret);
+ }
+
/*
* Need to update the delayed_refs_rsv with any changes we may have
* made.
@@ -1103,6 +1151,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return 0;
}
+void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ if (refcount_dec_and_test(&ref->refs)) {
+ WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, ref);
+ }
+}
+
/*
* This does a simple search for the head node for a given extent. Returns the
* head node if found, or NULL if not.
@@ -1115,41 +1171,91 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
return find_ref_head(delayed_refs, bytenr, false);
}
+static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
+{
+ int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY;
+
+ if (type < entry->type)
+ return -1;
+ if (type > entry->type)
+ return 1;
+
+ if (type == BTRFS_TREE_BLOCK_REF_KEY) {
+ if (root < entry->ref_root)
+ return -1;
+ if (root > entry->ref_root)
+ return 1;
+ } else {
+ if (parent < entry->parent)
+ return -1;
+ if (parent > entry->parent)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Check to see if a given root/parent reference is attached to the head. This
+ * only checks for BTRFS_ADD_DELAYED_REF references that match, as that
+ * indicates the reference exists for the given root or parent. This is for
+ * tree blocks only.
+ *
+ * @head: the head of the bytenr we're searching.
+ * @root: the root objectid of the reference if it is a normal reference.
+ * @parent: the parent if this is a shared backref.
+ */
+bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
+ u64 root, u64 parent)
+{
+ struct rb_node *node;
+ bool found = false;
+
+ lockdep_assert_held(&head->mutex);
+
+ spin_lock(&head->lock);
+ node = head->ref_tree.rb_root.rb_node;
+ while (node) {
+ struct btrfs_delayed_ref_node *entry;
+ int ret;
+
+ entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
+ ret = find_comp(entry, root, parent);
+ if (ret < 0) {
+ node = node->rb_left;
+ } else if (ret > 0) {
+ node = node->rb_right;
+ } else {
+ /*
+ * We only want to count ADD actions, as drops mean the
+ * ref doesn't exist.
+ */
+ if (entry->action == BTRFS_ADD_DELAYED_REF)
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&head->lock);
+ return found;
+}
+
void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
- kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
- kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+ kmem_cache_destroy(btrfs_delayed_ref_node_cachep);
kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}
int __init btrfs_delayed_ref_init(void)
{
- btrfs_delayed_ref_head_cachep = kmem_cache_create(
- "btrfs_delayed_ref_head",
- sizeof(struct btrfs_delayed_ref_head), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
if (!btrfs_delayed_ref_head_cachep)
goto fail;
- btrfs_delayed_tree_ref_cachep = kmem_cache_create(
- "btrfs_delayed_tree_ref",
- sizeof(struct btrfs_delayed_tree_ref), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_delayed_tree_ref_cachep)
- goto fail;
-
- btrfs_delayed_data_ref_cachep = kmem_cache_create(
- "btrfs_delayed_data_ref",
- sizeof(struct btrfs_delayed_data_ref), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_delayed_data_ref_cachep)
+ btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
+ if (!btrfs_delayed_ref_node_cachep)
goto fail;
- btrfs_delayed_extent_op_cachep = kmem_cache_create(
- "btrfs_delayed_extent_op",
- sizeof(struct btrfs_delayed_extent_op), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0);
if (!btrfs_delayed_extent_op_cachep)
goto fail;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index fd9bf2b709c0..352921e76c74 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -6,13 +6,55 @@
#ifndef BTRFS_DELAYED_REF_H
#define BTRFS_DELAYED_REF_H
+#include <linux/types.h>
#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
/* these are the possible values of struct btrfs_delayed_ref_node->action */
-#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
-#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
-#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
-#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+enum btrfs_delayed_ref_action {
+ /* Add one backref to the tree */
+ BTRFS_ADD_DELAYED_REF = 1,
+ /* Delete one backref from the tree */
+ BTRFS_DROP_DELAYED_REF,
+ /* Record a full extent allocation */
+ BTRFS_ADD_DELAYED_EXTENT,
+ /* Not changing ref count on head ref */
+ BTRFS_UPDATE_DELAYED_HEAD,
+} __packed;
+
+struct btrfs_data_ref {
+ /* For EXTENT_DATA_REF */
+
+ /* Inode which refers to this data extent */
+ u64 objectid;
+
+ /*
+ * file_offset - extent_offset
+ *
+ * file_offset is the key.offset of the EXTENT_DATA key.
+ * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
+ */
+ u64 offset;
+};
+
+struct btrfs_tree_ref {
+ /*
+ * Level of this tree block.
+ *
+ * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
+ */
+ int level;
+
+ /* For non-skinny metadata, no special member needed */
+};
struct btrfs_delayed_ref_node {
struct rb_node ref_node;
@@ -32,6 +74,15 @@ struct btrfs_delayed_ref_node {
/* seq number to keep track of insertion order */
u64 seq;
+ /* The ref_root for this ref */
+ u64 ref_root;
+
+ /*
+ * The parent for this ref, if this isn't set the ref_root is the
+ * reference owner.
+ */
+ u64 parent;
+
/* ref count on this data structure */
refcount_t refs;
@@ -48,11 +99,15 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
+
+ union {
+ struct btrfs_tree_ref tree_ref;
+ struct btrfs_data_ref data_ref;
+ };
};
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
- u8 level;
bool update_key;
bool update_flags;
u64 flags_to_set;
@@ -105,6 +160,21 @@ struct btrfs_delayed_ref_head {
int ref_mod;
/*
+ * The root that triggered the allocation when must_insert_reserved is
+ * set to true.
+ */
+ u64 owning_root;
+
+ /*
+ * Track reserved bytes when setting must_insert_reserved. On success
+ * or cleanup, we will need to free the reservation.
+ */
+ u64 reserved_bytes;
+
+ /* Tree block level, for metadata only. */
+ u8 level;
+
+ /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -117,26 +187,12 @@ struct btrfs_delayed_ref_head {
* the free has happened.
*/
bool must_insert_reserved;
+
bool is_data;
bool is_system;
bool processing;
};
-struct btrfs_delayed_tree_ref {
- struct btrfs_delayed_ref_node node;
- u64 root;
- u64 parent;
- int level;
-};
-
-struct btrfs_delayed_data_ref {
- struct btrfs_delayed_ref_node node;
- u64 root;
- u64 parent;
- u64 objectid;
- u64 offset;
-};
-
enum btrfs_delayed_ref_flags {
/* Indicate that we are flushing delayed refs for the commit */
BTRFS_DELAYED_REFS_FLUSHING,
@@ -146,8 +202,16 @@ struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root_cached href_root;
- /* dirty extent records */
- struct rb_root dirty_extent_root;
+ /*
+ * Track dirty extent records.
+ * The keys correspond to the logical address of the extent ("bytenr")
+ * right shifted by fs_info->sectorsize_bits. This is both to get a more
+ * dense index space (optimizes xarray structure) and because indexes in
+ * xarrays are of "unsigned long" type, meaning they are 32 bits wide on
+ * 32 bits platforms, limiting the extent range to 4G which is too low
+ * and makes it unusable (truncated index values) on 32 bits platforms.
+ */
+ struct xarray dirty_extents;
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
@@ -183,47 +247,11 @@ enum btrfs_ref_type {
BTRFS_REF_DATA,
BTRFS_REF_METADATA,
BTRFS_REF_LAST,
-};
-
-struct btrfs_data_ref {
- /* For EXTENT_DATA_REF */
-
- /* Original root this data extent belongs to */
- u64 owning_root;
-
- /* Inode which refers to this data extent */
- u64 ino;
-
- /*
- * file_offset - extent_offset
- *
- * file_offset is the key.offset of the EXTENT_DATA key.
- * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
- */
- u64 offset;
-};
-
-struct btrfs_tree_ref {
- /*
- * Level of this tree block
- *
- * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
- */
- int level;
-
- /*
- * Root which owns this tree block.
- *
- * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
- */
- u64 owning_root;
-
- /* For non-skinny metadata, no special member needed */
-};
+} __packed;
struct btrfs_ref {
enum btrfs_ref_type type;
- int action;
+ enum btrfs_delayed_ref_action action;
/*
* Whether this extent should go through qgroup record.
@@ -238,7 +266,14 @@ struct btrfs_ref {
u64 real_root;
#endif
u64 bytenr;
- u64 len;
+ u64 num_bytes;
+ u64 owning_root;
+
+ /*
+ * The root that owns the reference for this reference, this will be set
+ * or ->parent will be set, depending on what type of reference this is.
+ */
+ u64 ref_root;
/* Bytenr of the parent tree block */
u64 parent;
@@ -249,8 +284,7 @@ struct btrfs_ref {
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
-extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
-extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_ref_node_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
int __init btrfs_delayed_ref_init(void);
@@ -277,51 +311,21 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in
return num_bytes;
}
-static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
- int action, u64 bytenr, u64 len, u64 parent)
+static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
+ int num_csum_items)
{
- generic_ref->action = action;
- generic_ref->bytenr = bytenr;
- generic_ref->len = len;
- generic_ref->parent = parent;
+ /*
+ * Deleting csum items does not result in new nodes/leaves and does not
+ * require changing the free space tree, only the csum tree, so this is
+ * all we need.
+ */
+ return btrfs_calc_metadata_size(fs_info, num_csum_items);
}
-static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root, u64 mod_root, bool skip_qgroup)
-{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* If @real_root not set, use @root as fallback */
- generic_ref->real_root = mod_root ?: root;
-#endif
- generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.owning_root = root;
- generic_ref->type = BTRFS_REF_METADATA;
- if (skip_qgroup || !(is_fstree(root) &&
- (!mod_root || is_fstree(mod_root))))
- generic_ref->skip_qgroup = true;
- else
- generic_ref->skip_qgroup = false;
-
-}
-
-static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset, u64 mod_root,
- bool skip_qgroup)
-{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* If @real_root not set, use @root as fallback */
- generic_ref->real_root = mod_root ?: ref_root;
-#endif
- generic_ref->data_ref.owning_root = ref_root;
- generic_ref->data_ref.ino = ino;
- generic_ref->data_ref.offset = offset;
- generic_ref->type = BTRFS_REF_DATA;
- if (skip_qgroup || !(is_fstree(ref_root) &&
- (!mod_root || is_fstree(mod_root))))
- generic_ref->skip_qgroup = true;
- else
- generic_ref->skip_qgroup = false;
-}
+void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
+ bool skip_qgroup);
+void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
+ u64 mod_root, bool skip_qgroup);
static inline struct btrfs_delayed_extent_op *
btrfs_alloc_delayed_extent_op(void)
@@ -336,25 +340,7 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
}
-static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
-{
- WARN_ON(refcount_read(&ref->refs) == 0);
- if (refcount_dec_and_test(&ref->refs)) {
- WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
- switch (ref->type) {
- case BTRFS_TREE_BLOCK_REF_KEY:
- case BTRFS_SHARED_BLOCK_REF_KEY:
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- break;
- case BTRFS_EXTENT_DATA_REF_KEY:
- case BTRFS_SHARED_DATA_REF_KEY:
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- break;
- default:
- BUG();
- }
- }
-}
+void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref);
static inline u64 btrfs_ref_head_to_space_flags(
struct btrfs_delayed_ref_head *head_ref)
@@ -379,7 +365,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes,
+ u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
@@ -402,27 +388,51 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums);
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
-void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- u64 num_bytes);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
+bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
+ u64 root, u64 parent);
-/*
- * helper functions to cast a node into its container
- */
-static inline struct btrfs_delayed_tree_ref *
-btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
{
- return container_of(node, struct btrfs_delayed_tree_ref, node);
+ if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ return node->data_ref.objectid;
+ return node->tree_ref.level;
}
-static inline struct btrfs_delayed_data_ref *
-btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
{
- return container_of(node, struct btrfs_delayed_data_ref, node);
+ if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ return node->data_ref.offset;
+ return 0;
+}
+
+static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
+{
+ ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);
+
+ if (ref->type == BTRFS_REF_DATA) {
+ if (ref->parent)
+ return BTRFS_SHARED_DATA_REF_KEY;
+ else
+ return BTRFS_EXTENT_DATA_REF_KEY;
+ } else {
+ if (ref->parent)
+ return BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ return BTRFS_TREE_BLOCK_REF_KEY;
+ }
+
+ return 0;
}
#endif
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f77ef719a3b1..604399e59a3d 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -11,13 +11,10 @@
#include <linux/math64.h>
#include "misc.h"
#include "ctree.h"
-#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
-#include "check-integrity.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
@@ -247,6 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ struct file *bdev_file;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -257,12 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev)) {
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->bdev_holder, NULL);
+ if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev);
+ return PTR_ERR(bdev_file);
}
+ bdev = file_bdev(bdev_file);
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -313,11 +312,11 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
+ device->bdev_file = bdev_file;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->holder = fs_info->bdev_holder;
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_devices;
ret = btrfs_get_dev_zone_info(device, false);
@@ -334,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- blkdev_put(bdev, fs_info->bdev_holder);
+ fput(bdev_file);
return ret;
}
@@ -549,8 +548,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
u64 physical)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 chunk_offset = cache->start;
int num_extents, cur_extent;
int i;
@@ -566,9 +564,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
spin_unlock(&cache->lock);
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- ASSERT(!IS_ERR(em));
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(map));
num_extents = 0;
cur_extent = 0;
@@ -582,7 +579,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
cur_extent = i;
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
if (num_extents > 1 && cur_extent < num_extents - 1) {
/*
@@ -688,7 +685,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
/*
* Commit dev_replace state and reserve 1 item for it.
@@ -828,25 +825,46 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- u64 start = 0;
- int i;
+ struct rb_node *node;
- write_lock(&em_tree->lock);
- do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
- break;
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++)
+ /*
+ * The chunk mutex must be held so that no new chunks can be created
+ * while we are updating existing chunks. This guarantees we don't miss
+ * any new chunk that gets created for a range that falls before the
+ * range of the last chunk we processed.
+ */
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ write_lock(&fs_info->mapping_tree_lock);
+ node = rb_first_cached(&fs_info->mapping_tree);
+ while (node) {
+ struct rb_node *next = rb_next(node);
+ struct btrfs_chunk_map *map;
+ u64 next_start;
+
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ next_start = map->start + map->chunk_len;
+
+ for (int i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
- } while (start);
- write_unlock(&em_tree->lock);
+
+ if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) {
+ map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX);
+ if (!map)
+ break;
+ node = &map->rb_node;
+ /*
+ * Drop the lookup reference since we are holding the
+ * lock in write mode and no one can remove the chunk
+ * map from the tree and drop its tree reference.
+ */
+ btrfs_free_chunk_map(map);
+ } else {
+ node = next;
+ }
+ }
+ write_unlock(&fs_info->mapping_tree_lock);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -886,7 +904,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
/*
* We have to use this loop approach because at this point src_device
@@ -1005,8 +1023,7 @@ error:
btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
- btrfs_scratch_superblocks(fs_info, src_device->bdev,
- src_device->name->str);
+ btrfs_scratch_superblocks(fs_info, src_device);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 675082ccec89..23e480efe5e6 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -6,11 +6,15 @@
#ifndef BTRFS_DEV_REPLACE_H
#define BTRFS_DEV_REPLACE_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
+
struct btrfs_ioctl_dev_replace_args;
struct btrfs_fs_info;
struct btrfs_trans_handle;
struct btrfs_dev_replace;
struct btrfs_block_group;
+struct btrfs_device;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7066414be6ee..1e8cd7c9472e 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -22,7 +22,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
*trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_key *cpu_key,
+ const struct btrfs_key *cpu_key,
u32 data_size,
const char *name,
int name_len)
@@ -108,7 +108,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
- struct btrfs_key *location, u8 type, u64 index)
+ const struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
@@ -379,7 +379,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* for a specific name.
*/
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
@@ -417,7 +417,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_dir_item *di)
+ const struct btrfs_dir_item *di)
{
struct extent_buffer *leaf;
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index aab4b7cc7fa0..5f6dfafc91f1 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -3,11 +3,21 @@
#ifndef BTRFS_DIR_ITEM_H
#define BTRFS_DIR_ITEM_H
+#include <linux/types.h>
+#include <linux/crc32c.h>
+
+struct fscrypt_str;
+struct btrfs_fs_info;
+struct btrfs_key;
+struct btrfs_path;
+struct btrfs_root;
+struct btrfs_trans_handle;
+
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
- struct btrfs_key *location, u8 type, u64 index);
+ const struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
@@ -23,7 +33,7 @@ struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root,
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_dir_item *di);
+ const struct btrfs_dir_item *di);
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -35,8 +45,13 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const char *name,
int name_len);
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return crc32c((u32)~1, name, len);
+}
+
#endif
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
new file mode 100644
index 000000000000..71984d7db839
--- /dev/null
+++ b/fs/btrfs/direct-io.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/fsverity.h>
+#include <linux/iomap.h>
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "direct-io.h"
+#include "extent-tree.h"
+#include "file.h"
+#include "fs.h"
+#include "transaction.h"
+#include "volumes.h"
+
+struct btrfs_dio_data {
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ struct btrfs_ordered_extent *ordered;
+ bool data_space_reserved;
+ bool nocow_done;
+};
+
+struct btrfs_dio_private {
+ /* Range of I/O */
+ u64 file_offset;
+ u32 bytes;
+
+ /* This must be last */
+ struct btrfs_bio bbio;
+};
+
+static struct bio_set btrfs_dio_bioset;
+
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+ struct extent_state **cached_state,
+ unsigned int iomap_flags)
+{
+ const bool writing = (iomap_flags & IOMAP_WRITE);
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ int ret = 0;
+
+ /* Direct lock must be taken before the extent lock. */
+ if (nowait) {
+ if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
+ return -EAGAIN;
+ } else {
+ lock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ }
+
+ while (1) {
+ if (nowait) {
+ if (!try_lock_extent(io_tree, lockstart, lockend,
+ cached_state)) {
+ ret = -EAGAIN;
+ break;
+ }
+ } else {
+ lock_extent(io_tree, lockstart, lockend, cached_state);
+ }
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure there's no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
+ lockend - lockstart + 1);
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered &&
+ (!writing || !filemap_range_has_page(inode->i_mapping,
+ lockstart, lockend)))
+ break;
+
+ unlock_extent(io_tree, lockstart, lockend, cached_state);
+
+ if (ordered) {
+ if (nowait) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ break;
+ }
+ /*
+ * If we are doing a DIO read and the ordered extent we
+ * found is for a buffered write, we can not wait for it
+ * to complete and retry, because if we do so we can
+ * deadlock with concurrent buffered writes on page
+ * locks. This happens only if our DIO read covers more
+ * than one extent map, if at this point has already
+ * created an ordered extent for a previous extent map
+ * and locked its range in the inode's io tree, and a
+ * concurrent write against that previous extent map's
+ * range and this range started (we unlock the ranges
+ * in the io tree only when the bios complete and
+ * buffered writes always lock pages before attempting
+ * to lock range in the io tree).
+ */
+ if (writing ||
+ test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
+ btrfs_start_ordered_extent(ordered);
+ else
+ ret = nowait ? -EAGAIN : -ENOTBLK;
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /*
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readahead (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but did not have
+ * yet a corresponding bio submitted (whence it can not
+ * complete), which makes readahead wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
+ */
+ ret = nowait ? -EAGAIN : -ENOTBLK;
+ }
+
+ if (ret)
+ break;
+
+ cond_resched();
+ }
+
+ if (ret)
+ unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ return ret;
+}
+
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
+ const u64 start,
+ const struct btrfs_file_extent *file_extent,
+ const int type)
+{
+ struct extent_map *em = NULL;
+ struct btrfs_ordered_extent *ordered;
+
+ if (type != BTRFS_ORDERED_NOCOW) {
+ em = btrfs_create_io_em(inode, start, file_extent, type);
+ if (IS_ERR(em))
+ goto out;
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
+ (1U << type) |
+ (1U << BTRFS_ORDERED_DIRECT));
+ if (IS_ERR(ordered)) {
+ if (em) {
+ free_extent_map(em);
+ btrfs_drop_extent_map_range(inode, start,
+ start + file_extent->num_bytes - 1, false);
+ }
+ em = ERR_CAST(ordered);
+ } else {
+ ASSERT(!dio_data->ordered);
+ dio_data->ordered = ordered;
+ }
+ out:
+
+ return em;
+}
+
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 len)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_file_extent file_extent;
+ struct extent_map *em;
+ struct btrfs_key ins;
+ u64 alloc_hint;
+ int ret;
+
+ alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
+again:
+ ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
+ 0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
+ if (ret)
+ return ERR_PTR(ret);
+
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = ins.offset;
+ file_extent.ram_bytes = ins.offset;
+ file_extent.offset = 0;
+ file_extent.compression = BTRFS_COMPRESS_NONE;
+ em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
+ BTRFS_ORDERED_REGULAR);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ if (IS_ERR(em))
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+ 1);
+
+ return em;
+}
+
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 *lenp,
+ unsigned int iomap_flags)
+{
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_file_extent file_extent;
+ struct extent_map *em = *map;
+ int type;
+ u64 block_start;
+ struct btrfs_block_group *bg;
+ bool can_nocow = false;
+ bool space_reserved = false;
+ u64 len = *lenp;
+ u64 prev_len;
+ int ret = 0;
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->disk_bytenr != EXTENT_MAP_HOLE)) {
+ if (em->flags & EXTENT_FLAG_PREALLOC)
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = extent_map_block_start(em) + (start - em->start);
+
+ if (can_nocow_extent(inode, start, &len,
+ &file_extent, false, false) == 1) {
+ bg = btrfs_inc_nocow_writers(fs_info, block_start);
+ if (bg)
+ can_nocow = true;
+ }
+ }
+
+ prev_len = len;
+ if (can_nocow) {
+ struct extent_map *em2;
+
+ /* We can NOCOW, so only need to reserve metadata space. */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ nowait);
+ if (ret < 0) {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+ btrfs_dec_nocow_writers(bg);
+ if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+ ret = -EAGAIN;
+ goto out;
+ }
+ space_reserved = true;
+
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
+ &file_extent, type);
+ btrfs_dec_nocow_writers(bg);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em2;
+ em = em2;
+ }
+
+ if (IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+
+ dio_data->nocow_done = true;
+ } else {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+
+ if (nowait) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /*
+ * If we could not allocate data space before locking the file
+ * range and we can't do a NOCOW write, then we have to fail.
+ */
+ if (!dio_data->data_space_reserved) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /*
+ * We have to COW and we have already reserved data space before,
+ * so now we reserve only metadata.
+ */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ false);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+
+ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ *map = em;
+ len = min(len, em->len - (start - em->start));
+ if (len < prev_len)
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ prev_len - len, true);
+ }
+
+ /*
+ * We have created our ordered extent, so we can now release our reservation
+ * for an outstanding extent.
+ */
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+out:
+ if (ret && space_reserved) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+ }
+ *lenp = len;
+ return ret;
+}
+
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct extent_map *em;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_dio_data *dio_data = iter->private;
+ u64 lockstart, lockend;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
+ u64 len = length;
+ const u64 data_alloc_len = length;
+ u32 unlock_bits = EXTENT_LOCKED;
+
+ /*
+ * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+ * we're NOWAIT we may submit a bio for a partial range and return
+ * EIOCBQUEUED, which would result in an errant short read.
+ *
+ * The best way to handle this would be to allow for partial completions
+ * of iocb's, so we could submit the partial bio, return and fault in
+ * the rest of the pages, and then submit the io for the rest of the
+ * range. However we don't have that currently, so simply return
+ * -EAGAIN at this point so that the normal path is used.
+ */
+ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+ return -EAGAIN;
+
+ /*
+ * Cap the size of reads to that usually seen in buffered I/O as we need
+ * to allocate a contiguous array for the checksums.
+ */
+ if (!write)
+ len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
+
+ lockstart = start;
+ lockend = start + len - 1;
+
+ /*
+ * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk - the first flush only starts
+ * compression on the data, while keeping the pages locked, so by the
+ * time the second flush returns we know bios for the compressed pages
+ * were submitted and finished, and the pages no longer under writeback.
+ *
+ * If we have a NOWAIT request and we have any pages in the range that
+ * are locked, likely due to compression still in progress, we don't want
+ * to block on page locks. We also don't want to block on pages marked as
+ * dirty or under writeback (same as for the non-compression case).
+ * iomap_dio_rw() did the same check, but after that and before we got
+ * here, mmap'ed writes may have happened or buffered reads started
+ * (readpage() and readahead(), which lock pages), as we haven't locked
+ * the file range yet.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (flags & IOMAP_NOWAIT) {
+ if (filemap_range_needs_writeback(inode->i_mapping,
+ lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
+ }
+
+ memset(dio_data, 0, sizeof(*dio_data));
+
+ /*
+ * We always try to allocate data space and must do it before locking
+ * the file range, to avoid deadlocks with concurrent writes to the same
+ * range if the range has several extents and the writes don't expand the
+ * current i_size (the inode lock is taken in shared mode). If we fail to
+ * allocate data space here we continue and later, after locking the
+ * file range, we fail with ENOSPC only if we figure out we can not do a
+ * NOCOW write.
+ */
+ if (write && !(flags & IOMAP_NOWAIT)) {
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, data_alloc_len, false);
+ if (!ret)
+ dio_data->data_space_reserved = true;
+ else if (ret && !(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ goto err;
+ }
+
+ /*
+ * If this errors out it's because we couldn't invalidate pagecache for
+ * this range and we need to fallback to buffered IO, or we are doing a
+ * NOWAIT read/write and we need to block.
+ */
+ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+ if (ret < 0)
+ goto err;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
+
+ /*
+ * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+ * io. INLINE is special, and we could probably kludge it in here, but
+ * it's still buffered so for safety lets just fall back to the generic
+ * buffered path.
+ *
+ * For COMPRESSED we _have_ to read the entire extent in so we can
+ * decompress it, so there will be buffering required no matter what we
+ * do, so go ahead and fallback to buffered.
+ *
+ * We return -ENOTBLK because that's what makes DIO go ahead and go back
+ * to buffered IO. Don't blame me, this is the price we pay for using
+ * the generic code.
+ */
+ if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
+ free_extent_map(em);
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fallback to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fallback to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
+ goto unlock_err;
+ }
+
+ len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data then what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, &len, flags);
+ if (ret < 0)
+ goto unlock_err;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
+ if (dio_data->data_space_reserved) {
+ u64 release_offset;
+ u64 release_len = 0;
+
+ if (dio_data->nocow_done) {
+ release_offset = start;
+ release_len = data_alloc_len;
+ } else if (len < data_alloc_len) {
+ release_offset = start + len;
+ release_len = data_alloc_len - len;
+ }
+
+ if (release_len > 0)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ release_offset,
+ release_len);
+ }
+ }
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = extent_map_block_start(em) + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
+ iomap->length = len;
+ free_extent_map(em);
+
+ /*
+ * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
+ * writes only hold it for this part. We hold the extent lock until
+ * we're completely done with the extent map to make sure it remains
+ * valid.
+ */
+ if (write)
+ unlock_bits |= EXTENT_DIO_LOCKED;
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, &cached_state);
+
+ /* We didn't use everything, unlock the dio extent for the remainder. */
+ if (!write && (start + len) < lockend)
+ unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
+ lockend, NULL);
+
+ return 0;
+
+unlock_err:
+ /*
+ * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
+ * to update this, be explicit that we expect EXTENT_LOCKED and
+ * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
+err:
+ if (dio_data->data_space_reserved) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, data_alloc_len);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+
+ return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_dio_data *dio_data = iter->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
+ return 0;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ pos, length, false);
+ else
+ unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
+ ret = -ENOTBLK;
+ }
+ if (write) {
+ btrfs_put_ordered_extent(dio_data->ordered);
+ dio_data->ordered = NULL;
+ }
+
+ if (write)
+ extent_changeset_free(dio_data->data_reserved);
+ return ret;
+}
+
+static void btrfs_dio_end_io(struct btrfs_bio *bbio)
+{
+ struct btrfs_dio_private *dip =
+ container_of(bbio, struct btrfs_dio_private, bbio);
+ struct btrfs_inode *inode = bbio->inode;
+ struct bio *bio = &bbio->bio;
+
+ if (bio->bi_status) {
+ btrfs_warn(inode->root->fs_info,
+ "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
+ btrfs_ino(inode), bio->bi_opf,
+ dip->file_offset, dip->bytes, bio->bi_status);
+ }
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ btrfs_finish_ordered_extent(bbio->ordered, NULL,
+ dip->file_offset, dip->bytes,
+ !bio->bi_status);
+ } else {
+ unlock_dio_extent(&inode->io_tree, dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
+ }
+
+ bbio->bio.bi_private = bbio->private;
+ iomap_dio_bio_end_io(bio);
+}
+
+static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+ struct btrfs_ordered_extent *ordered)
+{
+ u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 len = bbio->bio.bi_iter.bi_size;
+ struct btrfs_ordered_extent *new;
+ int ret;
+
+ /* Must always be called for the beginning of an ordered extent. */
+ if (WARN_ON_ONCE(start != ordered->disk_bytenr))
+ return -EINVAL;
+
+ /* No need to split if the ordered extent covers the entire bio. */
+ if (ordered->disk_num_bytes == len) {
+ refcount_inc(&ordered->refs);
+ bbio->ordered = ordered;
+ return 0;
+ }
+
+ /*
+ * Don't split the extent_map for NOCOW extents, as we're writing into
+ * a pre-existing one.
+ */
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+ ret = split_extent_map(bbio->inode, bbio->file_offset,
+ ordered->num_bytes, len,
+ ordered->disk_bytenr);
+ if (ret)
+ return ret;
+ }
+
+ new = btrfs_split_ordered_extent(ordered, len);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ bbio->ordered = new;
+ return 0;
+}
+
+static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
+ loff_t file_offset)
+{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_dio_private *dip =
+ container_of(bbio, struct btrfs_dio_private, bbio);
+ struct btrfs_dio_data *dio_data = iter->private;
+
+ btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+ btrfs_dio_end_io, bio->bi_private);
+ bbio->inode = BTRFS_I(iter->inode);
+ bbio->file_offset = file_offset;
+
+ dip->file_offset = file_offset;
+ dip->bytes = bio->bi_iter.bi_size;
+
+ dio_data->submitted += bio->bi_iter.bi_size;
+
+ /*
+ * Check if we are doing a partial write. If we are, we need to split
+ * the ordered extent to match the submitted bio. Hang on to the
+ * remaining unfinishable ordered_extent in dio_data so that it can be
+ * cancelled in iomap_end to avoid a deadlock wherein faulting the
+ * remaining pages is blocked on the outstanding ordered extent.
+ */
+ if (iter->flags & IOMAP_WRITE) {
+ int ret;
+
+ ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
+ if (ret) {
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ file_offset, dip->bytes,
+ !ret);
+ bio->bi_status = errno_to_blk_status(ret);
+ iomap_dio_bio_end_io(bio);
+ return;
+ }
+ }
+
+ btrfs_submit_bbio(bbio, 0);
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_dio_submit_io,
+ .bio_set = &btrfs_dio_bioset,
+};
+
+static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data = { 0 };
+
+ return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data = { 0 };
+
+ return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ const u32 blocksize_mask = fs_info->sectorsize - 1;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
+ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ loff_t pos;
+ ssize_t written = 0;
+ ssize_t written_buffered;
+ size_t prev_left = 0;
+ loff_t endbyte;
+ ssize_t ret;
+ unsigned int ilock_flags = 0;
+ struct iomap_dio *dio;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ /*
+ * If the write DIO is within EOF, use a shared lock and also only if
+ * security bits will likely not be dropped by file_remove_privs() called
+ * from btrfs_write_check(). Either will need to be rechecked after the
+ * lock was acquired.
+ */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
+ ilock_flags |= BTRFS_ILOCK_SHARED;
+
+relock:
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+ if (ret < 0)
+ return ret;
+
+ /* Shared lock cannot be used with security bits set. */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ return ret;
+ }
+
+ ret = btrfs_write_check(iocb, from, ret);
+ if (ret < 0) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ /*
+ * Re-check since file size may have changed just before taking the
+ * lock or pos may have changed because of O_APPEND in generic_write_check()
+ */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
+ pos + iov_iter_count(from) > i_size_read(inode)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ if (check_direct_IO(fs_info, from, pos)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto buffered;
+ }
+
+ /*
+ * The iov_iter can be mapped to the same file range we are writing to.
+ * If that's the case, then we will deadlock in the iomap code, because
+ * it first calls our callback btrfs_dio_iomap_begin(), which will create
+ * an ordered extent, and after that it will fault in the pages that the
+ * iov_iter refers to. During the fault in we end up in the readahead
+ * pages code (starting at btrfs_readahead()), which will lock the range,
+ * find that ordered extent and then wait for it to complete (at
+ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+ * obviously the ordered extent can never complete as we didn't submit
+ * yet the respective bio(s). This always happens when the buffer is
+ * memory mapped to the same file range, since the iomap DIO code always
+ * invalidates pages in the target file range (after starting and waiting
+ * for any writeback).
+ *
+ * So here we disable page faults in the iov_iter and then retry if we
+ * got -EFAULT, faulting in the pages before the retry.
+ */
+again:
+ from->nofault = true;
+ dio = btrfs_dio_write(iocb, from, written);
+ from->nofault = false;
+
+ if (IS_ERR_OR_NULL(dio)) {
+ ret = PTR_ERR_OR_ZERO(dio);
+ } else {
+ /*
+ * If we have a synchronous write, we must make sure the fsync
+ * triggered by the iomap_dio_complete() call below doesn't
+ * deadlock on the inode lock - we are already holding it and we
+ * can't call it after unlocking because we may need to complete
+ * partial writes due to the input buffer (or parts of it) not
+ * being already faulted in.
+ */
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
+ ret = iomap_dio_complete(dio);
+ current->journal_info = NULL;
+ }
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ written = ret;
+
+ if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(from);
+ /*
+ * We have more data left to write. Try to fault in as many as
+ * possible of the remainder pages and retry. We do this without
+ * releasing and locking again the inode, to prevent races with
+ * truncate.
+ *
+ * Also, in case the iov refers to pages in the file range of the
+ * file we want to write to (due to a mmap), we could enter an
+ * infinite loop if we retry after faulting the pages in, since
+ * iomap will invalidate any pages in the range early on, before
+ * it tries to fault in the pages of the iov. So we keep track of
+ * how much was left of iov in the previous EFAULT and fallback
+ * to buffered IO in case we haven't made any progress.
+ */
+ if (left == prev_left) {
+ ret = -ENOTBLK;
+ } else {
+ fault_in_iov_iter_readable(from, left);
+ prev_left = left;
+ goto again;
+ }
+ }
+
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
+ /*
+ * If 'ret' is -ENOTBLK or we have not written all data, then it means
+ * we must fallback to buffered IO.
+ */
+ if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
+ goto out;
+
+buffered:
+ /*
+ * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+ * it must retry the operation in a context where blocking is acceptable,
+ * because even if we end up not blocking during the buffered IO attempt
+ * below, we will block when flushing and waiting for the IO.
+ */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ written_buffered = btrfs_buffered_write(iocb, from);
+ if (written_buffered < 0) {
+ ret = written_buffered;
+ goto out;
+ }
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
+ endbyte = pos + written_buffered - 1;
+ ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
+ if (ret)
+ goto out;
+ ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+ if (ret)
+ goto out;
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+out:
+ return ret < 0 ? ret : written;
+}
+
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ if (!iter_is_iovec(iter))
+ return 0;
+
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ const struct iovec *iov1 = iter_iov(iter) + seg;
+ const struct iovec *iov2 = iter_iov(iter) + i;
+
+ if (iov1->iov_base == iov2->iov_base)
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t prev_left = 0;
+ ssize_t read = 0;
+ ssize_t ret;
+
+ if (fsverity_active(inode))
+ return 0;
+
+ if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
+ return 0;
+
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+again:
+ /*
+ * This is similar to what we do for direct IO writes, see the comment
+ * at btrfs_direct_write(), but we also disable page faults in addition
+ * to disabling them only at the iov_iter level. This is because when
+ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+ * which can still trigger page fault ins despite having set ->nofault
+ * to true of our 'to' iov_iter.
+ *
+ * The difference to direct IO writes is that we deadlock when trying
+ * to lock the extent range in the inode's tree during he page reads
+ * triggered by the fault in (while for writes it is due to waiting for
+ * our own ordered extent). This is because for direct IO reads,
+ * btrfs_dio_iomap_begin() returns with the extent range locked, which
+ * is only unlocked in the endio callback (end_bio_extent_readpage()).
+ */
+ pagefault_disable();
+ to->nofault = true;
+ ret = btrfs_dio_read(iocb, to, read);
+ to->nofault = false;
+ pagefault_enable();
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ read = ret;
+
+ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(to);
+
+ if (left == prev_left) {
+ /*
+ * We didn't make any progress since the last attempt,
+ * fallback to a buffered read for the remainder of the
+ * range. This is just to avoid any possibility of looping
+ * for too long.
+ */
+ ret = read;
+ } else {
+ /*
+ * We made some progress since the last retry or this is
+ * the first time we are retrying. Fault in as many pages
+ * as possible and retry.
+ */
+ fault_in_iov_iter_writeable(to, left);
+ prev_left = left;
+ goto again;
+ }
+ }
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+ return ret < 0 ? ret : read;
+}
+
+int __init btrfs_init_dio(void)
+{
+ if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_dio_private, bbio.bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __cold btrfs_destroy_dio(void)
+{
+ bioset_exit(&btrfs_dio_bioset);
+}
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h
new file mode 100644
index 000000000000..3dc3ea926afe
--- /dev/null
+++ b/fs/btrfs/direct-io.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DIRECT_IO_H
+#define BTRFS_DIRECT_IO_H
+
+#include <linux/types.h>
+
+int __init btrfs_init_dio(void);
+void __cold btrfs_destroy_dio(void);
+
+ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from);
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to);
+
+#endif /* BTRFS_DIRECT_IO_H */
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 944a7340f6a4..de23c4b3515e 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
};
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
- struct btrfs_block_group *block_group)
+ const struct btrfs_block_group *block_group)
{
return &discard_ctl->discard_list[block_group->discard_index];
}
@@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
*
* Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
*/
-static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
+static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
{
struct btrfs_fs_info *fs_info = container_of(discard_ctl,
struct btrfs_fs_info,
@@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
lockdep_assert_held(&discard_ctl->lock);
- if (!btrfs_run_discard_work(discard_ctl))
- return;
if (list_empty(&block_group->discard_list) ||
block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
@@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
if (!btrfs_is_block_group_data_only(block_group))
return;
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
spin_lock(&discard_ctl->lock);
__add_to_discard_list(discard_ctl, block_group);
spin_unlock(&discard_ctl->lock);
@@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
block_group->discard_eligible_time = 0;
queued = !list_empty(&block_group->discard_list);
list_del_init(&block_group->discard_list);
- /*
- * If the block group is currently running in the discard workfn, we
- * don't want to deref it, since it's still being used by the workfn.
- * The workfn will notice this case and deref the block group when it is
- * finished.
- */
- if (queued && !running)
+ if (queued)
btrfs_put_block_group(block_group);
spin_unlock(&discard_ctl->lock);
@@ -250,6 +245,18 @@ again:
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
+ /*
+ * The block group must have been moved to other
+ * discard list even if discard was disabled in
+ * the meantime or a transaction abort happened,
+ * otherwise we can end up in an infinite loop,
+ * always jumping into the 'again' label and
+ * keep getting this block group over and over
+ * in case there are no other block groups in
+ * the discard lists.
+ */
+ ASSERT(block_group->discard_index !=
+ BTRFS_DISCARD_INDEX_UNUSED);
} else {
list_del_init(&block_group->discard_list);
btrfs_put_block_group(block_group);
@@ -260,9 +267,10 @@ again:
block_group->discard_cursor = block_group->start;
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
- discard_ctl->block_group = block_group;
}
if (block_group) {
+ btrfs_get_block_group(block_group);
+ discard_ctl->block_group = block_group;
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
}
@@ -493,9 +501,20 @@ static void btrfs_discard_workfn(struct work_struct *work)
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index, now);
- if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ if (!block_group)
+ return;
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
return;
+ }
if (now < block_group->discard_eligible_time) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
btrfs_discard_schedule_work(discard_ctl, false);
return;
}
@@ -547,15 +566,7 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
- /*
- * If the block group was removed from the discard list while it was
- * running in this workfn, then we didn't deref it, since this function
- * still owned that reference. But we set the discard_ctl->block_group
- * back to NULL, so we can use that condition to know that now we need
- * to deref the block_group.
- */
- if (discard_ctl->block_group == NULL)
- btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8ec411eb9c9b..e655fa3bfd9b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,7 +17,7 @@
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
@@ -29,8 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "check-integrity.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
@@ -75,20 +73,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = num_extent_pages(buf);
- const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ int num_pages;
+ u32 first_page_part;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+
+ if (buf->addr) {
+ /* Pages are contiguous, handle them as a big one. */
+ kaddr = buf->addr;
+ first_page_part = fs_info->nodesize;
+ num_pages = 1;
+ } else {
+ kaddr = folio_address(buf->folios[0]);
+ first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ num_pages = num_extent_pages(buf);
+ }
+
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
first_page_part - BTRFS_CSUM_SIZE);
+ /*
+ * Multiple single-page folios case would reach here.
+ *
+ * nodesize <= PAGE_SIZE and large folio all handled by above
+ * crypto_shash_update() already.
+ */
for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
- kaddr = page_address(buf->pages[i]);
+ kaddr = folio_address(buf->folios[i]);
crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
@@ -167,20 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages = num_extent_pages(eb);
+ int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
- u64 start = max_t(u64, eb->start, page_offset(p));
- u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ u64 start = max_t(u64, eb->start, folio_pos(folio));
+ u64 end = min_t(u64, eb->start + eb->len,
+ folio_pos(folio) + eb->folio_size);
u32 len = end - start;
ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, p, offset_in_page(start), mirror_num);
+ start, folio, offset_in_folio(folio, start),
+ mirror_num);
if (ret)
break;
}
@@ -196,7 +213,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
* structure for details.
*/
int btrfs_read_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int failed = 0;
@@ -245,6 +262,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start = btrfs_header_bytenr(eb);
+ u64 last_trans;
u8 result[BTRFS_CSUM_SIZE];
int ret;
@@ -254,15 +272,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
return BLK_STS_IOERR;
- if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
- WARN_ON_ONCE(found_start != 0);
+ /*
+ * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
+ * checksum it but zero-out its content. This is done to preserve
+ * ordering of I/O without unnecessarily writing out data.
+ */
+ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
+ memzero_extent_buffer(eb, 0, eb->len);
return BLK_STS_OK;
}
if (WARN_ON_ONCE(found_start != eb->start))
return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
- eb->len)))
+ if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
+ eb->start, eb->len)))
return BLK_STS_IOERR;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -282,12 +305,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
* Also check the generation, the eb reached here must be newer than
* last committed. Or something seriously wrong happened.
*/
- if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ last_trans = btrfs_get_last_trans_committed(fs_info);
+ if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"block=%llu bad generation, have %llu expect > %llu",
- eb->start, btrfs_header_generation(eb),
- fs_info->last_trans_committed);
+ eb->start, btrfs_header_generation(eb), last_trans);
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
@@ -318,9 +341,10 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
BTRFS_FSID_SIZE);
/*
- * alloc_fs_devices() copies the fsid into metadata_uuid if the
- * metadata_uuid is unset in the superblock, including for a seed device.
- * So, we can use fs_devices->metadata_uuid.
+ * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
+ * This is then overwritten by metadata_uuid if it is present in the
+ * device_list_add(). The same true for a seed device as well. So use of
+ * fs_devices::metadata_uuid is appropriate here.
*/
if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
return false;
@@ -334,7 +358,7 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
/* Do basic extent buffer checks at read time */
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
@@ -343,6 +367,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
u8 result[BTRFS_CSUM_SIZE];
const u8 *header_csum;
int ret = 0;
+ const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS);
ASSERT(check);
@@ -370,18 +395,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
}
csum_tree_block(eb, result);
- header_csum = page_address(eb->pages[0]) +
- get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+ header_csum = folio_address(eb->folios[0]) +
+ get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
-"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s",
eb->start, eb->read_mirror,
CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
- btrfs_header_level(eb));
- ret = -EUCLEAN;
- goto out;
+ btrfs_header_level(eb),
+ ignore_csum ? ", ignored" : "");
+ if (!ignore_csum) {
+ ret = -EUCLEAN;
+ goto out;
+ }
}
if (found_level != check->level) {
@@ -401,7 +429,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
goto out;
}
if (check->has_first_key) {
- struct btrfs_key *expect_key = &check->first_key;
+ const struct btrfs_key *expect_key = &check->first_key;
struct btrfs_key found_key;
if (found_level)
@@ -473,15 +501,15 @@ static int btree_migrate_folio(struct address_space *mapping,
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct btrfs_fs_info *fs_info;
int ret;
if (wbc->sync_mode == WB_SYNC_NONE) {
+ struct btrfs_fs_info *fs_info;
if (wbc->for_kupdate)
return 0;
- fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ fs_info = inode_to_fs_info(mapping->host);
/* this is a bit racy, but that's ok */
ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
BTRFS_DIRTY_METADATA_THRESH,
@@ -497,18 +525,19 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return false;
- return try_release_extent_buffer(&folio->page);
+ return try_release_extent_buffer(folio);
}
static void btree_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
struct extent_io_tree *tree;
- tree = &BTRFS_I(folio->mapping->host)->io_tree;
+
+ tree = &folio_to_inode(folio)->io_tree;
extent_invalidate_folio(tree, folio, offset);
btree_release_folio(folio, GFP_NOFS);
if (folio_get_private(folio)) {
- btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
+ btrfs_warn(folio_to_fs_info(folio),
"folio private not zero on folio %llu",
(unsigned long long)folio_pos(folio));
folio_detach_private(folio);
@@ -519,7 +548,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset,
static bool btree_dirty_folio(struct address_space *mapping,
struct folio *folio)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
struct btrfs_subpage_info *spi = fs_info->subpage_info;
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
@@ -610,10 +639,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
- if (btrfs_check_eb_owner(buf, check->owner_root)) {
- free_extent_buffer_stale(buf);
- return ERR_PTR(-EUCLEAN);
- }
return buf;
}
@@ -621,7 +646,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
- bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ bool dummy = btrfs_is_testing(fs_info);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
@@ -633,12 +658,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->state = 0;
RB_CLEAR_NODE(&root->rb_node);
- root->last_trans = 0;
+ btrfs_set_root_last_trans(root, 0);
root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
- root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ xa_init(&root->inodes);
+ xa_init(&root->delayed_nodes);
btrfs_init_root_block_rsv(root);
@@ -649,14 +674,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
- INIT_LIST_HEAD(&root->logged_list[0]);
- INIT_LIST_HEAD(&root->logged_list[1]);
- spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
- spin_lock_init(&root->log_extents_lock[0]);
- spin_lock_init(&root->log_extents_lock[1]);
spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
@@ -675,9 +695,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
- root->log_transid = 0;
+ btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- root->last_log_commit = 0;
+ btrfs_set_root_last_log_commit(root, 0);
root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
@@ -754,7 +774,7 @@ int btrfs_global_root_insert(struct btrfs_root *root)
if (tmp) {
ret = -EEXIST;
btrfs_warn(fs_info, "global root %llu %llu already exists",
- root->root_key.objectid, root->root_key.offset);
+ btrfs_root_id(root), root->root_key.offset);
}
return ret;
}
@@ -826,13 +846,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
return btrfs_global_root(fs_info, &key);
}
-struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
-{
- if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
- return fs_info->block_group_root;
- return btrfs_extent_root(fs_info, 0);
-}
-
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -859,7 +872,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.offset = 0;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
- BTRFS_NESTING_NORMAL);
+ 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -936,7 +949,7 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
+ NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf))
return PTR_ERR(leaf);
@@ -989,8 +1002,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
- log_root->last_trans = trans->transid;
- log_root->root_key.offset = root->root_key.objectid;
+ btrfs_set_root_last_trans(log_root, trans->transid);
+ log_root->root_key.offset = btrfs_root_id(root);
inode_item = &log_root->root_item.inode;
btrfs_set_stack_inode_generation(inode_item, 1);
@@ -1004,15 +1017,15 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
- root->log_transid = 0;
+ btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- root->last_log_commit = 0;
+ btrfs_set_root_last_log_commit(root, 0);
return 0;
}
static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
struct btrfs_path *path,
- struct btrfs_key *key)
+ const struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_tree_parent_check check = { 0 };
@@ -1054,15 +1067,15 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
* For real fs, and not log/reloc trees, root owner must
* match its root node owner
*/
- if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
- root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- root->root_key.objectid != btrfs_header_owner(root->node)) {
+ if (!btrfs_is_testing(fs_info) &&
+ btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
+ btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_root_id(root) != btrfs_header_owner(root->node)) {
btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
- root->root_key.objectid, root->node->start,
+ btrfs_root_id(root), root->node->start,
btrfs_header_owner(root->node),
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EUCLEAN;
goto fail;
}
@@ -1074,7 +1087,7 @@ fail:
}
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+ const struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_path *path;
@@ -1099,9 +1112,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
btrfs_drew_lock_init(&root->snapshot_lock);
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root) &&
- is_fstree(root->root_key.objectid)) {
+ is_fstree(btrfs_root_id(root))) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1110,7 +1123,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
- if (is_fstree(root->root_key.objectid) &&
+ if (is_fstree(btrfs_root_id(root)) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
@@ -1179,6 +1192,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
return btrfs_grab_root(fs_info->block_group_root);
case BTRFS_FREE_SPACE_TREE_OBJECTID:
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->stripe_root);
default:
return NULL;
}
@@ -1195,7 +1210,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
root);
if (ret == 0) {
btrfs_grab_root(root);
@@ -1207,7 +1222,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
return ret;
}
-void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
+void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
struct btrfs_root *root;
@@ -1220,6 +1235,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "leaked root %s refcount %d",
btrfs_root_name(&root->root_key, buf),
refcount_read(&root->refs));
+ WARN_ON_ONCE(1);
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
btrfs_put_root(root);
@@ -1241,9 +1257,14 @@ static void free_global_roots(struct btrfs_fs_info *fs_info)
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
+ struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
+
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->ordered_bytes);
+ if (percpu_counter_initialized(em_counter))
+ ASSERT(percpu_counter_sum_positive(em_counter) == 0);
+ percpu_counter_destroy(em_counter);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
@@ -1259,11 +1280,11 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
btrfs_put_root(fs_info->block_group_root);
+ btrfs_put_root(fs_info->stripe_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
- kfree(fs_info->subpage_info);
kvfree(fs_info);
}
@@ -1411,7 +1432,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * Return a root for the given objectid.
+ *
* @fs_info: the fs_info
* @objectid: the objectid we need to lookup
*
@@ -1708,11 +1730,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
}
/*
- * read_backup_root - Reads a backup root based on the passed priority. Prio 0
- * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
+ * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
+ * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
*
- * fs_info - filesystem whose backup roots need to be read
- * priority - priority of backup root required
+ * @fs_info: filesystem whose backup roots need to be read
+ * @priority: priority of backup root required
*
* Returns backup root index on success and -EINVAL otherwise.
*/
@@ -1812,6 +1834,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
free_root_extent_buffers(info->block_group_root);
+ free_root_extent_buffers(info->stripe_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -1822,7 +1845,8 @@ void btrfs_put_root(struct btrfs_root *root)
return;
if (refcount_dec_and_test(&root->refs)) {
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ if (WARN_ON(!xa_empty(&root->inodes)))
+ xa_destroy(&root->inodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -1896,7 +1920,7 @@ static int btrfs_init_btree_inode(struct super_block *sb)
if (!inode)
return -ENOMEM;
- inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID);
set_nlink(inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
@@ -1907,15 +1931,11 @@ static int btrfs_init_btree_inode(struct super_block *sb)
inode->i_mapping->a_ops = &btree_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
IO_TREE_BTREE_INODE_IO);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
- BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
- BTRFS_I(inode)->location.type = 0;
- BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
__insert_inode_hash(inode, hash);
fs_info->btree_inode = inode;
@@ -1939,7 +1959,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2114,7 +2134,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
/* If we have IGNOREDATACSUMS skip loading these roots. */
if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
- set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
return 0;
}
@@ -2148,8 +2168,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
- ret = PTR_ERR(root);
+ ret = PTR_ERR(root);
break;
}
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -2167,7 +2186,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
if (!found || ret) {
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
ret = ret ? ret : -ENOENT;
@@ -2212,7 +2231,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
struct btrfs_key location;
int ret;
- BUG_ON(!fs_info->tree_root);
+ ASSERT(fs_info->tree_root);
ret = load_global_roots(tree_root);
if (ret)
@@ -2271,7 +2290,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(root)) {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
fs_info->quota_root = root;
}
@@ -2288,6 +2306,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->stripe_root = root;
+ }
+ }
+
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2305,21 +2337,29 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-int btrfs_validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
int ret = 0;
+ const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS);
if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
btrfs_err(fs_info, "no valid FS found");
ret = -EINVAL;
}
- if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
- btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
- btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
- ret = -EINVAL;
+ if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) {
+ if (!ignore_flags) {
+ btrfs_err(fs_info,
+ "unrecognized or unsupported super flag 0x%llx",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ ret = -EINVAL;
+ } else {
+ btrfs_info(fs_info,
+ "unrecognized or unsupported super flags: 0x%llx, ignored",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ }
}
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "tree_root level too big: %d >= %d",
@@ -2390,7 +2430,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
+ if (!fs_info->fs_devices->temp_fsid &&
+ memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
sb->fsid, fs_info->fs_devices->fsid);
@@ -2421,7 +2462,7 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
(!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
!btrfs_fs_incompat(fs_info, NO_HOLES))) {
btrfs_err(fs_info,
- "block-group-tree feature requires fres-space-tree and no-holes");
+ "block-group-tree feature requires free-space-tree and no-holes");
ret = -EINVAL;
}
@@ -2542,7 +2583,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev
struct btrfs_tree_parent_check check = {
.level = level,
.transid = gen,
- .owner_root = root->root_key.objectid
+ .owner_root = btrfs_root_id(root)
};
int ret = 0;
@@ -2607,9 +2648,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
*/
btrfs_set_super_log_root(sb, 0);
- /* We can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
btrfs_warn(fs_info, "try to load backup roots slot %d", i);
ret = read_backup_root(fs_info, i);
backup_index = ret;
@@ -2643,7 +2681,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
/* All successful */
fs_info->generation = btrfs_header_generation(tree_root->node);
- fs_info->last_trans_committed = fs_info->generation;
+ btrfs_set_last_trans_committed(fs_info, fs_info->generation);
fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
@@ -2713,7 +2751,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
#endif
- extent_map_tree_init(&fs_info->mapping_tree);
+ fs_info->mapping_tree = RB_ROOT_CACHED;
+ rwlock_init(&fs_info->mapping_tree_lock);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2744,11 +2783,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->ordered_root_lock);
btrfs_init_scrub(fs_info);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- fs_info->check_integrity_print_mask = 0;
-#endif
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
+ btrfs_init_extent_map_shrinker_work(fs_info);
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -2786,6 +2823,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ /* Default compress algorithm when user does -o compress */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2808,6 +2848,12 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -2832,6 +2878,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (sb_rdonly(sb))
set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
+ if (btrfs_test_opt(fs_info, IGNOREMETACSUMS))
+ set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state);
return btrfs_alloc_stripe_hash_table(fs_info);
}
@@ -2877,22 +2925,22 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
- int i = 0;
- int err = 0;
- unsigned int ret = 0;
+ int ret = 0;
while (1) {
+ unsigned int found;
+
spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
- if (!ret) {
+ if (!found) {
spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
- root_objectid = gang[ret - 1]->root_key.objectid + 1;
+ root_objectid = btrfs_root_id(gang[found - 1]) + 1;
- for (i = 0; i < ret; i++) {
+ for (int i = 0; i < found; i++) {
/* Avoid to grab roots in dead_roots. */
if (btrfs_root_refs(&gang[i]->root_item) == 0) {
gang[i] = NULL;
@@ -2903,35 +2951,25 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->fs_roots_radix_lock);
- for (i = 0; i < ret; i++) {
+ for (int i = 0; i < found; i++) {
if (!gang[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
- err = btrfs_orphan_cleanup(gang[i]);
- if (err)
- goto out;
+ root_objectid = btrfs_root_id(gang[i]);
+ /*
+ * Continue to release the remaining roots after the first
+ * error without cleanup and preserve the first error
+ * for the return.
+ */
+ if (!ret)
+ ret = btrfs_orphan_cleanup(gang[i]);
btrfs_put_root(gang[i]);
}
+ if (ret)
+ break;
+
root_objectid++;
}
-out:
- /* Release the uncleaned roots due to error. */
- for (; i < ret; i++) {
- if (gang[i])
- btrfs_put_root(gang[i]);
- }
- return err;
-}
-
-/*
- * Some options only have meaning at mount time and shouldn't persist across
- * remounts, or be displayed. Clear these at the end of mount and remount
- * code paths.
- */
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
-{
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
- btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+ return ret;
}
/*
@@ -2946,7 +2984,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- rebuild_free_space_tree = true;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ btrfs_warn(fs_info,
+ "'clear_cache' option is ignored with extent tree v2");
+ else
+ rebuild_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
btrfs_warn(fs_info, "free space tree is invalid");
@@ -3160,14 +3202,12 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
return 0;
}
-int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- char *options)
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
{
u32 sectorsize;
u32 nodesize;
u32 stripesize;
u64 generation;
- u64 features;
u16 csum_type;
struct btrfs_super_block *disk_super;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -3250,15 +3290,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
disk_super = fs_info->super_copy;
-
- features = btrfs_super_flags(disk_super);
- if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
- features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
- btrfs_set_super_flags(disk_super, features);
- btrfs_info(fs_info,
- "found metadata UUID change in progress flag, clearing");
- }
-
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_for_commit));
@@ -3279,13 +3310,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
- /*
- * In the long term, we'll store the compression type in the super
- * block, and it'll be used for per file compression control.
- */
- fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
-
-
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
@@ -3296,42 +3320,35 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
- ret = btrfs_parse_options(fs_info, options, sb->s_flags);
- if (ret)
+ /*
+ * Handle the space caching options appropriately now that we have the
+ * super block loaded and validated.
+ */
+ btrfs_set_free_space_cache_settings(fs_info);
+
+ if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
+ ret = -EINVAL;
goto fail_alloc;
+ }
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
if (ret < 0)
goto fail_alloc;
- if (sectorsize < PAGE_SIZE) {
- struct btrfs_subpage_info *subpage_info;
-
- /*
- * V1 space cache has some hardcoded PAGE_SIZE usage, and is
- * going to be deprecated.
- *
- * Force to use v2 cache for subpage case.
- */
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
- "forcing free space tree for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
+ /*
+ * At this point our mount options are validated, if we set ->max_inline
+ * to something non-standard make sure we truncate it to sectorsize.
+ */
+ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+ if (sectorsize < PAGE_SIZE)
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
- if (!subpage_info) {
- ret = -ENOMEM;
- goto fail_alloc;
- }
- btrfs_init_subpage_info(subpage_info, sectorsize);
- fs_info->subpage_info = subpage_info;
- }
ret = btrfs_init_workqueues(fs_info);
if (ret)
@@ -3498,41 +3515,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_cleaner;
}
- if (!btrfs_test_opt(fs_info, NOSSD) &&
- !fs_info->fs_devices->rotating) {
- btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
- }
-
- /*
- * For devices supporting discard turn on discard=async automatically,
- * unless it's already set or disabled. This could be turned off by
- * nodiscard for the same mount.
- *
- * The zoned mode piggy backs on the discard functionality for
- * resetting a zone. There is no reason to delay the zone reset as it is
- * fast enough. So, do not enable async discard for zoned mode.
- */
- if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
- btrfs_test_opt(fs_info, NODISCARD)) &&
- fs_info->fs_devices->discardable &&
- !btrfs_is_zoned(fs_info)) {
- btrfs_set_and_info(fs_info, DISCARD_ASYNC,
- "auto enabling async discard");
- }
-
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
- ret = btrfsic_mount(fs_info, fs_devices,
- btrfs_test_opt(fs_info,
- CHECK_INTEGRITY_DATA) ? 1 : 0,
- fs_info->check_integrity_print_mask);
- if (ret)
- btrfs_warn(fs_info,
- "failed to initialize integrity check module: %d",
- ret);
- }
-#endif
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3558,7 +3540,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- goto clear_oneshot;
+ return 0;
ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
@@ -3586,8 +3568,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
-clear_oneshot:
- btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3624,7 +3604,7 @@ fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
fail_alloc:
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
iput(fs_info->btree_inode);
fail:
@@ -3637,28 +3617,25 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
static void btrfs_end_super_write(struct bio *bio)
{
struct btrfs_device *device = bio->bi_private;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
- struct page *page;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- page = bvec->bv_page;
+ struct folio_iter fi;
+ bio_for_each_folio_all(fi, bio) {
if (bio->bi_status) {
btrfs_warn_rl_in_rcu(device->fs_info,
- "lost page write due to IO error on %s (%d)",
+ "lost super block write due to IO error on %s (%d)",
btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
- ClearPageUptodate(page);
- SetPageError(page);
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_WRITE_ERRS);
- } else {
- SetPageUptodate(page);
+ /* Ensure failure if the primary sb fails. */
+ if (bio->bi_opf & REQ_FUA)
+ atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
+ &device->sb_write_errors);
+ else
+ atomic_inc(&device->sb_write_errors);
}
-
- put_page(page);
- unlock_page(page);
+ folio_unlock(fi.folio);
+ folio_put(fi.folio);
}
bio_put(bio);
@@ -3670,7 +3647,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
struct btrfs_super_block *super;
struct page *page;
u64 bytenr, bytenr_orig;
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
int ret;
bytenr_orig = btrfs_sb_offset(copy_num);
@@ -3745,34 +3722,36 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
- * pages we use for writing are locked.
+ * folios we use for writing are locked.
*
* Write @max_mirrors copies of the superblock, where 0 means default that fit
* the expected device size at commit time. Note that max_mirrors must be
* same for write and wait phases.
*
- * Return number of errors when page is not found or submission fails.
+ * Return number of errors when folio is not found or submission fails.
*/
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct address_space *mapping = device->bdev->bd_inode->i_mapping;
+ struct address_space *mapping = device->bdev->bd_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
- int errors = 0;
int ret;
u64 bytenr, bytenr_orig;
+ atomic_set(&device->sb_write_errors, 0);
+
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
shash->tfm = fs_info->csum_shash;
for (i = 0; i < max_mirrors; i++) {
- struct page *page;
+ struct folio *folio;
struct bio *bio;
struct btrfs_super_block *disk_super;
+ size_t offset;
bytenr_orig = btrfs_sb_offset(i);
ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
@@ -3782,7 +3761,7 @@ static int write_dev_supers(struct btrfs_device *device,
btrfs_err(device->fs_info,
"couldn't get super block location for mirror %d",
i);
- errors++;
+ atomic_inc(&device->sb_write_errors);
continue;
}
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@@ -3795,20 +3774,20 @@ static int write_dev_supers(struct btrfs_device *device,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
sb->csum);
- page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
- GFP_NOFS);
- if (!page) {
+ folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ GFP_NOFS);
+ if (IS_ERR(folio)) {
btrfs_err(device->fs_info,
"couldn't get super block page for bytenr %llu",
bytenr);
- errors++;
+ atomic_inc(&device->sb_write_errors);
continue;
}
+ ASSERT(folio_order(folio) == 0);
- /* Bump the refcount for wait_dev_supers() */
- get_page(page);
-
- disk_super = page_address(page);
+ offset = offset_in_folio(folio, bytenr);
+ disk_super = folio_address(folio) + offset;
memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
/*
@@ -3822,8 +3801,7 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
bio->bi_private = device;
bio->bi_end_io = btrfs_end_super_write;
- __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
- offset_in_page(bytenr));
+ bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);
/*
* We FUA only the first super block. The others we allow to
@@ -3832,22 +3810,20 @@ static int write_dev_supers(struct btrfs_device *device,
*/
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
-
- btrfsic_check_bio(bio);
submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
- errors++;
+ atomic_inc(&device->sb_write_errors);
}
- return errors < i ? 0 : -1;
+ return atomic_read(&device->sb_write_errors) < i ? 0 : -1;
}
/*
* Wait for write completion of superblocks done by write_dev_supers,
* @max_mirrors same for write and wait phases.
*
- * Return number of errors when page is not found or not marked up to
- * date.
+ * Return -1 if primary super block write failed or when there were no super block
+ * copies written. Otherwise 0.
*/
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
@@ -3861,7 +3837,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
for (i = 0; i < max_mirrors; i++) {
- struct page *page;
+ struct folio *folio;
ret = btrfs_sb_log_location(device, i, READ, &bytenr);
if (ret == -ENOENT) {
@@ -3876,30 +3852,21 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
device->commit_total_bytes)
break;
- page = find_get_page(device->bdev->bd_inode->i_mapping,
- bytenr >> PAGE_SHIFT);
- if (!page) {
- errors++;
- if (i == 0)
- primary_failed = true;
+ folio = filemap_get_folio(device->bdev->bd_mapping,
+ bytenr >> PAGE_SHIFT);
+ /* If the folio has been removed, then we know it completed. */
+ if (IS_ERR(folio))
continue;
- }
- /* Page is submitted locked and unlocked once the IO completes */
- wait_on_page_locked(page);
- if (PageError(page)) {
- errors++;
- if (i == 0)
- primary_failed = true;
- }
+ ASSERT(folio_order(folio) == 0);
- /* Drop our reference */
- put_page(page);
-
- /* Drop the reference from the writing run */
- put_page(page);
+ /* Folio will be unlocked once the write completes. */
+ folio_wait_locked(folio);
+ folio_put(folio);
}
- /* log error, force error return */
+ errors += atomic_read(&device->sb_write_errors);
+ if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)
+ primary_failed = true;
if (primary_failed) {
btrfs_err(device->fs_info, "error writing primary super block to device %llu",
device->devid);
@@ -3929,28 +3896,11 @@ static void write_dev_flush(struct btrfs_device *device)
device->last_flush_error = BLK_STS_OK;
-#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- /*
- * When a disk has write caching disabled, we skip submission of a bio
- * with flush and sync requests before writing the superblock, since
- * it's not needed. However when the integrity checker is enabled, this
- * results in reports that there are metadata blocks referred by a
- * superblock that were not properly flushed. So don't skip the bio
- * submission only when the integrity checker is enabled for the sake
- * of simplicity, since this is a debug tool and not meant for use in
- * non-debug builds.
- */
- if (!bdev_write_cache(device->bdev))
- return;
-#endif
-
bio_init(bio, device->bdev, NULL, 0,
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
-
- btrfsic_check_bio(bio);
submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
@@ -4177,7 +4127,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid);
+ (unsigned long)btrfs_root_id(root));
if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -4196,9 +4146,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_trans_handle *trans;
-
mutex_lock(&fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
@@ -4208,10 +4155,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(fs_info->tree_root);
}
static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
@@ -4220,9 +4164,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
struct btrfs_transaction *tmp;
bool found = false;
- if (list_empty(&fs_info->trans_list))
- return;
-
/*
* This function is only called at the very end of close_ctree(),
* thus no other running transaction, no need to take trans_lock.
@@ -4314,6 +4255,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
/*
+ * Handle the error fs first, as it will flush and wait for all ordered
+ * extents. This will generate delayed iputs, thus we want to handle
+ * it first.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ btrfs_error_commit_super(fs_info);
+
+ /*
* Wait for any fixup workers to complete.
* If we don't wait for them here and they are still running by the time
* we call kthread_stop() against the cleaner kthread further below, we
@@ -4323,6 +4272,40 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
* already the cleaner, but below we run all pending delayed iputs.
*/
btrfs_flush_workqueue(fs_info->fixup_workers);
+ /*
+ * Similar case here, we have to wait for delalloc workers before we
+ * proceed below and stop the cleaner kthread, otherwise we trigger a
+ * use-after-tree on the cleaner kthread task_struct when a delalloc
+ * worker running submit_compressed_extents() adds a delayed iput, which
+ * does a wake up on the cleaner kthread, which was already freed below
+ * when we call kthread_stop().
+ */
+ btrfs_flush_workqueue(fs_info->delalloc_workers);
+
+ /*
+ * We can have ordered extents getting their last reference dropped from
+ * the fs_info->workers queue because for async writes for data bios we
+ * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
+ * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
+ * has an error, and that later function can do the final
+ * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
+ * which adds a delayed iput for the inode. So we must flush the queue
+ * so that we don't have delayed iputs after committing the current
+ * transaction below and stopping the cleaner and transaction kthreads.
+ */
+ btrfs_flush_workqueue(fs_info->workers);
+
+ /*
+ * When finishing a compressed write bio we schedule a work queue item
+ * to finish an ordered extent - btrfs_finish_compressed_write_work()
+ * calls btrfs_finish_ordered_extent() which in turns does a call to
+ * btrfs_queue_ordered_fn(), and that queues the ordered extent
+ * completion either in the endio_write_workers work queue or in the
+ * fs_info->endio_freespace_worker work queue. We flush those queues
+ * below, so before we flush them we must flush this queue for the
+ * workers of compressed writes.
+ */
+ flush_workqueue(fs_info->compressed_write_workers);
/*
* After we parked the cleaner kthread, ordered extents may have
@@ -4352,6 +4335,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
+ cancel_work_sync(&fs_info->extent_map_shrinker_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4381,9 +4365,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (BTRFS_FS_ERROR(fs_info))
- btrfs_error_commit_super(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -4437,12 +4418,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
- btrfsic_unmount(fs_info->fs_devices);
-#endif
-
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
btrfs_close_devices(fs_info->fs_devices);
}
@@ -4464,22 +4440,13 @@ void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
/* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
ASSERT(trans->transid == fs_info->generation);
btrfs_assert_tree_write_locked(buf);
- if (transid != fs_info->generation) {
- WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
- buf->start, transid, fs_info->generation);
+ if (unlikely(transid != fs_info->generation)) {
btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
+ buf->start, transid, fs_info->generation);
}
set_extent_buffer_dirty(buf);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- /*
- * btrfs_check_leaf() won't check item data if we don't have WRITTEN
- * set, so this will only validate the basic structure of the items.
- */
- if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) {
- btrfs_print_leaf(buf);
- ASSERT(0);
- }
-#endif
}
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
@@ -4520,10 +4487,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_mutex);
-
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
@@ -4547,7 +4510,7 @@ static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++) {
if (!gang[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
+ root_objectid = btrfs_root_id(gang[i]);
btrfs_free_log(NULL, gang[i]);
btrfs_put_root(gang[i]);
}
@@ -4600,7 +4563,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
* extents that haven't had their dirty pages IO start writeout yet
* actually get run and error out properly.
*/
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -4631,6 +4594,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
if (head->must_insert_reserved)
pin_bytes = true;
@@ -4683,7 +4647,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
struct inode *inode = NULL;
btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
delalloc_inodes);
- __btrfs_del_delalloc_inode(root, btrfs_inode);
+ btrfs_del_delalloc_inode(btrfs_inode);
spin_unlock(&root->delalloc_lock);
/*
@@ -4828,7 +4792,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4869,7 +4833,7 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
btrfs_qgroup_free_meta_all_pertrans(root);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
}
}
@@ -4898,14 +4862,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&fs_info->transaction_wait);
- btrfs_destroy_delayed_inodes(fs_info);
-
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
- btrfs_free_all_qgroup_pertrans(fs_info);
-
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
@@ -4958,6 +4918,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
btrfs_assert_delayed_root_empty(fs_info);
btrfs_destroy_all_delalloc_inodes(fs_info);
btrfs_drop_all_logs(fs_info);
+ btrfs_free_all_qgroup_pertrans(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
@@ -4982,7 +4943,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist a root
+ * with such id, but this is out of valid range.
+ */
+ ret = -EUCLEAN;
+ goto error;
+ }
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
@@ -5006,7 +4974,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
btrfs_warn(root->fs_info,
"the objectid of root %llu reaches its highest value",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -ENOSPC;
goto out;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index fca52385830c..127e31e08347 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,6 +6,22 @@
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
+#include <linux/sizes.h>
+#include <linux/compiler_types.h>
+#include "ctree.h"
+#include "fs.h"
+
+struct block_device;
+struct super_block;
+struct extent_buffer;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_fs_info;
+struct btrfs_super_block;
+struct btrfs_trans_handle;
+struct btrfs_tree_parent_check;
+struct btrfs_transaction;
+
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
@@ -25,11 +41,7 @@ static inline u64 btrfs_sb_offset(int mirror)
return BTRFS_SUPER_INFO_OFFSET;
}
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_tree_parent_check;
-
-void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
+void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
struct btrfs_tree_parent_check *check);
@@ -37,18 +49,13 @@ struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
int level);
-void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
- struct extent_buffer *buf);
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
-int __cold open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options);
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
-int btrfs_validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num);
+int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
@@ -56,7 +63,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key);
+ const struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
@@ -74,7 +81,6 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
-struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
@@ -82,7 +88,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check);
+ const struct btrfs_tree_parent_check *check);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -109,7 +115,7 @@ void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
- struct btrfs_tree_parent_check *check);
+ const struct btrfs_tree_parent_check *check);
blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 203e5964c9b0..e2b22bea348a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -5,7 +5,6 @@
#include "ctree.h"
#include "disk-io.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "export.h"
#include "accessors.h"
#include "super.h"
@@ -35,15 +34,15 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
type = FILEID_BTRFS_WITHOUT_PARENT;
fid->objectid = btrfs_ino(BTRFS_I(inode));
- fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
+ fid->root_objectid = btrfs_root_id(BTRFS_I(inode)->root);
fid->gen = inode->i_generation;
if (parent) {
u64 parent_root_id;
- fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+ fid->parent_objectid = btrfs_ino(BTRFS_I(parent));
fid->parent_gen = parent->i_generation;
- parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
+ parent_root_id = btrfs_root_id(BTRFS_I(parent)->root);
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
@@ -85,7 +84,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(root))
return ERR_CAST(root);
- inode = btrfs_iget(sb, objectid, root);
+ inode = btrfs_iget(objectid, root);
btrfs_put_root(root);
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -161,7 +160,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
return ERR_PTR(-ENOMEM);
if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = root->root_key.objectid;
+ key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
@@ -211,7 +210,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
found_key.offset, 0);
}
- return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));
+ return d_obtain_alias(btrfs_iget(key.objectid, root));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
@@ -222,7 +221,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
{
struct inode *inode = d_inode(child);
struct inode *dir = d_inode(parent);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_inode_ref *iref;
@@ -244,7 +243,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
return -ENOMEM;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index eba6bc4f5a61..464582273af9 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -4,6 +4,10 @@
#define BTRFS_EXPORT_H
#include <linux/exportfs.h>
+#include <linux/types.h>
+
+struct dentry;
+struct super_block;
extern const struct export_operations btrfs_export_ops;
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ff8e117a1ace..5f9a43734812 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -4,9 +4,9 @@
#include <trace/events/btrfs.h>
#include "messages.h"
#include "ctree.h"
+#include "extent_io.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
-#include "misc.h"
static struct kmem_cache *extent_state_cache;
@@ -48,6 +48,7 @@ static inline void btrfs_extent_state_leak_debug_check(void)
extent_state_in_tree(state),
refcount_read(&state->refs));
list_del(&state->leak_list);
+ WARN_ON_ONCE(1);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -58,12 +59,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- struct btrfs_inode *inode = tree->inode;
+ const struct btrfs_inode *inode;
u64 isize;
- if (!inode)
+ if (tree->owner != IO_TREE_INODE_IO)
return;
+ inode = extent_io_tree_to_inode_const(tree);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(inode->root->fs_info,
@@ -78,59 +80,82 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
+
/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
+ * The only tree allowed to set the inode is IO_TREE_INODE_IO.
*/
-static struct lock_class_key file_extent_tree_class;
+static bool is_inode_io_tree(const struct extent_io_tree *tree)
+{
+ return tree->owner == IO_TREE_INODE_IO;
+}
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
-};
+/* Return the inode if it's valid for the given tree, otherwise NULL. */
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
+
+/* Read-only access to the inode. */
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
+
+/* For read-only access to fs_info. */
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode->root->fs_info;
+ return tree->fs_info;
+}
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner)
{
- tree->fs_info = fs_info;
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
- tree->inode = NULL;
+ tree->fs_info = fs_info;
tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
+/*
+ * Empty an io tree, removing and freeing every extent state record from the
+ * tree. This should be called once we are sure no other task can access the
+ * tree anymore, so no tree updates happen after we empty the tree and there
+ * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never
+ * set on any extent state when calling this function).
+ */
void extent_io_tree_release(struct extent_io_tree *tree)
{
+ struct rb_root root;
+ struct extent_state *state;
+ struct extent_state *tmp;
+
spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
+ root = tree->state;
+ tree->state = RB_ROOT;
+ rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
+ /* Clear node to keep free_extent_state() happy. */
RB_CLEAR_NODE(&state->rb_node);
+ ASSERT(!(state->state & EXTENT_LOCK_BITS));
/*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
+ * No need for a memory barrier here, as we are holding the tree
+ * lock and we only change the waitqueue while holding that lock
+ * (see wait_extent_bit()).
*/
ASSERT(!waitqueue_active(&state->wq));
free_extent_state(state);
-
cond_resched_lock(&tree->lock);
}
+ /*
+ * Should still be empty even after a reschedule, no other task should
+ * be accessing the tree anymore.
+ */
+ ASSERT(RB_EMPTY_ROOT(&tree->state));
spin_unlock(&tree->lock);
}
@@ -321,10 +346,46 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
+ btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+ "extent io tree error on %s state start %llu end %llu",
+ opname, state->start, state->end);
+}
+
+static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *prev;
+
+ prev = prev_state(state);
+ if (prev && prev->end == state->start - 1 && prev->state == state->state) {
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, prev);
+ state->start = prev->start;
+ rb_erase(&prev->rb_node, &tree->state);
+ RB_CLEAR_NODE(&prev->rb_node);
+ free_extent_state(prev);
+ }
+}
+
+static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *next;
+
+ next = next_state(state);
+ if (next && next->start == state->end + 1 && next->state == state->state) {
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, next);
+ state->end = next->end;
+ rb_erase(&next->rb_node, &tree->state);
+ RB_CLEAR_NODE(&next->rb_node);
+ free_extent_state(next);
+ }
}
/*
@@ -338,31 +399,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*/
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
- struct extent_state *other;
-
- if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
return;
- other = prev_state(state);
- if (other && other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- other = next_state(state);
- if (other && other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
+ merge_prev_state(tree, state);
+ merge_next_state(tree, state);
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -372,8 +413,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_set_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -384,19 +425,27 @@ static void set_state_bits(struct extent_io_tree *tree,
* Insert an extent_state struct into the tree. 'bits' are set on the
* struct before it is inserted.
*
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
+ * Returns a pointer to the struct extent_state record containing the range
+ * requested for insertion, which may be the same as the given struct or it
+ * may be an existing record in the tree that was expanded to accommodate the
+ * requested range. In case of an extent_state different from the one that was
+ * given, the later can be freed or reused by the caller.
+ *
+ * On error it returns an error pointer.
*
* The tree lock is not taken internally. This is a utility function and
* probably isn't what you want to call (see set/clear_extent_bit).
*/
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, struct extent_changeset *changeset)
+static struct extent_state *insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits,
+ struct extent_changeset *changeset)
{
struct rb_node **node;
struct rb_node *parent = NULL;
- const u64 end = state->end;
+ const u64 start = state->start - 1;
+ const u64 end = state->end + 1;
+ const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
set_state_bits(tree, state, bits, changeset);
@@ -407,23 +456,41 @@ static int insert_state(struct extent_io_tree *tree,
parent = *node;
entry = rb_entry(parent, struct extent_state, rb_node);
- if (end < entry->start) {
+ if (state->end < entry->start) {
+ if (try_merge && end == entry->start &&
+ state->state == entry->state) {
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
+ entry->start = state->start;
+ merge_prev_state(tree, entry);
+ state->state = 0;
+ return entry;
+ }
node = &(*node)->rb_left;
- } else if (end > entry->end) {
+ } else if (state->end > entry->end) {
+ if (try_merge && entry->end == start &&
+ state->state == entry->state) {
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
+ entry->end = state->end;
+ merge_next_state(tree, entry);
+ state->state = 0;
+ return entry;
+ }
node = &(*node)->rb_right;
} else {
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, end);
- return -EEXIST;
+ return ERR_PTR(-EEXIST);
}
}
rb_link_node(&state->rb_node, parent, node);
rb_insert_color(&state->rb_node, &tree->state);
- merge_state(tree, state);
- return 0;
+ return state;
}
/*
@@ -460,8 +527,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (tree->inode)
- btrfs_split_delalloc_extent(tree->inode, orig, split);
+ if (is_inode_io_tree(tree))
+ btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
+ split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -508,8 +576,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_clear_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
+ bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -547,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
* inserting elements in the tree, so the gfp mask is used to indicate which
* allocations or sleeping are allowed.
*
- * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
- * range from the tree regardless of state (ie for truncate).
- *
* The range [start, end] is inclusive.
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
@@ -578,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
- wake = (bits & EXTENT_LOCKED) ? 1 : 0;
- if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0);
+ if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
clear = 1;
again:
if (!prealloc) {
@@ -650,7 +716,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
@@ -672,7 +738,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
if (wake)
wake_up(&state->wq);
@@ -708,26 +774,13 @@ out:
}
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
/*
* Wait for one or more bits to clear on a range in the state tree.
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state)
{
struct extent_state *state;
@@ -758,9 +811,15 @@ process_node:
goto out;
if (state->state & bits) {
+ DEFINE_WAIT(wait);
+
start = state->start;
refcount_inc(&state->refs);
- wait_on_state(tree, state);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
free_extent_state(state);
goto again;
}
@@ -799,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state,
static void cache_state(struct extent_state *state,
struct extent_state **cached_ptr)
{
- return cache_state_if_flags(state, cached_ptr,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
+ return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY);
}
/*
@@ -847,10 +905,19 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
if (state->end == start - 1 && extent_state_in_tree(state)) {
while ((state = next_state(state)) != NULL) {
if (state->state & bits)
- goto got_it;
+ break;
}
+ /*
+ * If we found the next extent state, clear cached_state
+ * so that we can cache the next extent state below and
+ * avoid future calls going over the same extent state
+ * again. If we haven't found any, clear as well since
+ * it's now useless.
+ */
free_extent_state(*cached_state);
*cached_state = NULL;
+ if (state)
+ goto got_it;
goto out;
}
free_extent_state(*cached_state);
@@ -892,6 +959,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state *state;
int ret = 1;
+ ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
if (state) {
@@ -987,10 +1056,10 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *prealloc = NULL;
struct rb_node **p = NULL;
struct rb_node *parent = NULL;
- int err = 0;
+ int ret = 0;
u64 last_start;
u64 last_end;
- u32 exclusive_bits = (bits & EXTENT_LOCKED);
+ u32 exclusive_bits = (bits & EXTENT_LOCK_BITS);
gfp_t mask;
set_gfp_mask_from_bits(&bits, &mask);
@@ -1012,6 +1081,9 @@ again:
*/
prealloc = alloc_extent_state(mask);
}
+ /* Optimistically preallocate the extent changeset ulist node. */
+ if (changeset)
+ extent_changeset_prealloc(changeset, mask);
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
@@ -1050,7 +1122,7 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = state->start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1086,7 +1158,7 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1103,12 +1175,12 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
+ ret = split_state(tree, state, prealloc, start);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (err)
+ if (ret)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits, changeset);
@@ -1133,6 +1205,8 @@ hit_next:
*/
if (state->start > start) {
u64 this_end;
+ struct extent_state *inserted_state;
+
if (end < last_start)
this_end = end;
else
@@ -1148,12 +1222,15 @@ hit_next:
*/
prealloc->start = start;
prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
+ inserted_state = insert_state(tree, prealloc, bits, changeset);
+ if (IS_ERR(inserted_state)) {
+ ret = PTR_ERR(inserted_state);
+ extent_io_tree_panic(tree, prealloc, "insert", ret);
+ }
- cache_state(prealloc, cached_state);
- prealloc = NULL;
+ cache_state(inserted_state, cached_state);
+ if (inserted_state == prealloc)
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -1167,16 +1244,19 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1198,7 +1278,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return err;
+ return ret;
}
@@ -1235,7 +1315,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *prealloc = NULL;
struct rb_node **p = NULL;
struct rb_node *parent = NULL;
- int err = 0;
+ int ret = 0;
u64 last_start;
u64 last_end;
bool first_iteration = true;
@@ -1274,7 +1354,7 @@ again:
if (!state) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
prealloc->start = start;
@@ -1325,14 +1405,14 @@ hit_next:
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
+ ret = split_state(tree, state, prealloc, start);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (err)
+ if (ret)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits, NULL);
@@ -1356,6 +1436,8 @@ hit_next:
*/
if (state->start > start) {
u64 this_end;
+ struct extent_state *inserted_state;
+
if (end < last_start)
this_end = end;
else
@@ -1363,7 +1445,7 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -1373,11 +1455,15 @@ hit_next:
*/
prealloc->start = start;
prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
+ inserted_state = insert_state(tree, prealloc, bits, NULL);
+ if (IS_ERR(inserted_state)) {
+ ret = PTR_ERR(inserted_state);
+ extent_io_tree_panic(tree, prealloc, "insert", ret);
+ goto out;
+ }
+ cache_state(inserted_state, cached_state);
+ if (inserted_state == prealloc)
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -1390,13 +1476,13 @@ hit_next:
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
@@ -1418,7 +1504,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return err;
+ return ret;
}
/*
@@ -1640,15 +1726,46 @@ search:
}
/*
- * Search a range in the state tree for a given mask. If 'filled' == 1, this
- * returns 1 only if every extent in the tree has the bits set. Otherwise, 1
- * is returned if any bit in the range is found set.
+ * Check if the single @bit exists in the given range.
*/
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached)
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
struct extent_state *state = NULL;
- int bitset = 0;
+ bool bitset = false;
+
+ ASSERT(is_power_of_2(bit));
+
+ spin_lock(&tree->lock);
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (state->start > end)
+ break;
+
+ if (state->state & bit) {
+ bitset = true;
+ break;
+ }
+
+ /* If state->end is (u64)-1, start will overflow to 0 */
+ start = state->end + 1;
+ if (start > end || start == 0)
+ break;
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/*
+ * Check if the whole range [@start,@end) contains the single @bit set.
+ */
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ bool bitset = true;
+
+ ASSERT(is_power_of_2(bit));
spin_lock(&tree->lock);
if (cached && extent_state_in_tree(cached) && cached->start <= start &&
@@ -1657,35 +1774,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
else
state = tree_search(tree, start);
while (state && start <= end) {
- if (filled && state->start > start) {
- bitset = 0;
+ if (state->start > start) {
+ bitset = false;
break;
}
if (state->start > end)
break;
- if (state->state & bits) {
- bitset = 1;
- if (!filled)
- break;
- } else if (filled) {
- bitset = 0;
+ if ((state->state & bit) == 0) {
+ bitset = false;
break;
}
if (state->end == (u64)-1)
break;
+ /*
+ * Last entry (if state->end is (u64)-1 and overflow happens),
+ * or next entry starts after the range.
+ */
start = state->end + 1;
- if (start > end)
+ if (start > end || start == 0)
break;
state = next_state(state);
}
/* We ran out of states and were still inside of our range. */
- if (filled && !state)
- bitset = 0;
+ if (!state)
+ bitset = false;
spin_unlock(&tree->lock);
return bitset;
}
@@ -1695,12 +1812,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset)
{
/*
- * We don't support EXTENT_LOCKED yet, as current changeset will
- * record any bits changed, so for EXTENT_LOCKED case, it will
- * either fail with -EEXIST or changeset will record the whole
- * range.
+ * We don't support EXTENT_LOCK_BITS yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCK_BITS case, it will either
+ * fail with -EEXIST or changeset will record the whole range.
*/
- ASSERT(!(bits & EXTENT_LOCKED));
+ ASSERT(!(bits & EXTENT_LOCK_BITS));
return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}
@@ -1709,26 +1825,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset)
{
/*
- * Don't support EXTENT_LOCKED case, same reason as
+ * Don't support EXTENT_LOCK_BITS case, same reason as
* set_record_extent_bits().
*/
- ASSERT(!(bits & EXTENT_LOCKED));
+ ASSERT(!(bits & EXTENT_LOCK_BITS));
return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached)
+bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached)
{
int err;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ err = __set_extent_bit(tree, start, end, bits, &failed_start,
NULL, cached, NULL);
if (err == -EEXIST) {
if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, cached);
+ clear_extent_bit(tree, start, failed_start - 1, bits, cached);
return 0;
}
return 1;
@@ -1738,23 +1853,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
* Either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state)
+int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *failed_state = NULL;
int err;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ err = __set_extent_bit(tree, start, end, bits, &failed_start,
&failed_state, cached_state, NULL);
while (err == -EEXIST) {
if (failed_start != start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, cached_state);
+ bits, cached_state);
- wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
- &failed_state);
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ wait_extent_bit(tree, failed_start, end, bits, &failed_state);
+ err = __set_extent_bit(tree, start, end, bits,
&failed_start, &failed_state,
cached_state, NULL);
}
@@ -1770,8 +1884,8 @@ void __cold extent_state_free_cachep(void)
int __init extent_state_init_cachep(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
- sizeof(struct extent_state), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_state), 0, 0,
+ NULL);
if (!extent_state_cache)
return -ENOMEM;
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 28c23a23d121..6ffef1cd37c1 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -3,15 +3,23 @@
#ifndef BTRFS_EXTENT_IO_TREE_H
#define BTRFS_EXTENT_IO_TREE_H
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/wait.h>
#include "misc.h"
struct extent_changeset;
+struct btrfs_fs_info;
+struct btrfs_inode;
/* Bits for the extent state */
enum {
ENUM_BIT(EXTENT_DIRTY),
ENUM_BIT(EXTENT_UPTODATE),
ENUM_BIT(EXTENT_LOCKED),
+ ENUM_BIT(EXTENT_DIO_LOCKED),
ENUM_BIT(EXTENT_NEW),
ENUM_BIT(EXTENT_DELALLOC),
ENUM_BIT(EXTENT_DEFRAG),
@@ -60,6 +68,8 @@ enum {
EXTENT_ADD_INODE_BYTES | \
EXTENT_CLEAR_ALL_BITS)
+#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED)
+
/*
* Redefined bits above which are used only in the device allocation tree,
* shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
@@ -87,9 +97,17 @@ enum {
struct extent_io_tree {
struct rb_root state;
- struct btrfs_fs_info *fs_info;
- /* Inode associated with this tree, or NULL. */
- struct btrfs_inode *inode;
+ /*
+ * The fs_info is needed for trace points, a tree attached to an inode
+ * needs the inode.
+ *
+ * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be
+ * accessed as inode->root->fs_info
+ */
+ union {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_inode *inode;
+ };
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -112,15 +130,29 @@ struct extent_state {
#endif
};
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
+int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached);
+bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
+{
+ return __lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
+static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+}
int __init extent_state_init_cachep(void);
void __cold extent_state_free_cachep(void);
@@ -131,8 +163,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
struct extent_state **cached_state);
void free_extent_state(struct extent_state *state);
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached_state);
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached_state);
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -192,7 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state);
+static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+}
+
+static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+}
+
+static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL);
+}
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 021cf468274b..bb3602059906 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -18,7 +18,7 @@
#include <linux/crc32c.h>
#include "ctree.h"
#include "extent-tree.h"
-#include "tree-log.h"
+#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
@@ -26,14 +26,11 @@
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
-#include "delalloc-space.h"
#include "discard.h"
-#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
@@ -42,14 +39,14 @@
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
+#include "raid-stripe-tree.h"
#undef SCRAMBLE_DELAYED_REFS
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node, u64 parent,
- u64 root_objectid, u64 owner_objectid,
- u64 owner_offset, int refs_to_drop,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -57,7 +54,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
- struct btrfs_key *ins, int ref_mod);
+ struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
@@ -100,18 +97,17 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags)
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owning_root)
{
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct extent_buffer *leaf;
struct btrfs_key key;
- u32 item_size;
u64 num_refs;
u64 extent_flags;
+ u64 owner = 0;
int ret;
/*
@@ -127,11 +123,6 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- if (!trans) {
- path->skip_locking = 1;
- path->search_commit_root = 1;
- }
-
search_again:
key.objectid = bytenr;
key.offset = offset;
@@ -145,7 +136,7 @@ search_again:
if (ret < 0)
goto out_free;
- if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -158,36 +149,37 @@ search_again:
}
if (ret == 0) {
- leaf = path->nodes[0];
- item_size = btrfs_item_size(leaf, path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- num_refs = btrfs_extent_refs(leaf, ei);
- extent_flags = btrfs_extent_flags(leaf, ei);
- } else {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_extent_item *ei;
+ const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (unlikely(item_size < sizeof(*ei))) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(fs_info, ret, NULL);
-
+ btrfs_abort_transaction(trans, ret);
goto out_free;
}
- BUG_ON(num_refs == 0);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ if (unlikely(num_refs == 0)) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected zero reference count for extent item (%llu %u %llu)",
+ key.objectid, key.type, key.offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out_free;
+ }
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
} else {
num_refs = 0;
extent_flags = 0;
ret = 0;
}
- if (!trans)
- goto out;
-
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
@@ -210,20 +202,20 @@ search_again:
spin_lock(&head->lock);
if (head->extent_op && head->extent_op->update_flags)
extent_flags |= head->extent_op->flags_to_set;
- else
- BUG_ON(num_refs == 0);
num_refs += head->ref_mod;
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
spin_unlock(&delayed_refs->lock);
-out:
+
WARN_ON(num_refs == 0);
if (refs)
*refs = num_refs;
if (flags)
*flags = extent_flags;
+ if (owning_root)
+ *owning_root = owner;
out_free:
btrfs_free_path(path);
return ret;
@@ -344,9 +336,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int type = btrfs_extent_inline_ref_type(eb, iref);
u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ return type;
+ }
+
if (type == BTRFS_TREE_BLOCK_REF_KEY ||
type == BTRFS_SHARED_BLOCK_REF_KEY ||
type == BTRFS_SHARED_DATA_REF_KEY ||
@@ -355,26 +353,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_TREE_BLOCK_REF_KEY)
return type;
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
- ASSERT(eb->fs_info);
+ ASSERT(fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
- if (offset &&
- IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return type;
if (type == BTRFS_SHARED_DATA_REF_KEY) {
- ASSERT(eb->fs_info);
+ ASSERT(fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ IS_ALIGNED(offset, fs_info->sectorsize))
return type;
}
} else {
@@ -385,7 +382,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
WARN_ON(1);
btrfs_print_leaf(eb);
- btrfs_err(eb->fs_info,
+ btrfs_err(fs_info,
"eb %llu iref 0x%lx invalid extent inline ref type %d",
eb->start, (unsigned long)iref, type);
@@ -399,11 +396,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -438,9 +435,8 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_extent_data_ref *ref;
struct extent_buffer *leaf;
u32 nritems;
- int ret;
int recow;
- int err = -ENOENT;
+ int ret;
key.objectid = bytenr;
if (parent) {
@@ -454,26 +450,26 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
again:
recow = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
+ if (ret < 0)
+ return ret;
if (parent) {
- if (!ret)
- return 0;
- goto fail;
+ if (ret)
+ return -ENOENT;
+ return 0;
}
+ ret = -ENOENT;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- err = ret;
- if (ret)
- goto fail;
+ if (ret) {
+ if (ret > 0)
+ return -ENOENT;
+ return ret;
+ }
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -494,37 +490,37 @@ again:
btrfs_release_path(path);
goto again;
}
- err = 0;
+ ret = 0;
break;
}
path->slots[0]++;
}
fail:
- return err;
+ return ret;
}
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- u64 bytenr, u64 parent,
- u64 root_objectid, u64 owner,
- u64 offset, int refs_to_add)
+ struct btrfs_delayed_ref_node *node,
+ u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
u32 size;
u32 num_refs;
int ret;
key.objectid = bytenr;
- if (parent) {
+ if (node->parent) {
key.type = BTRFS_SHARED_DATA_REF_KEY;
- key.offset = parent;
+ key.offset = node->parent;
size = sizeof(struct btrfs_shared_data_ref);
} else {
key.type = BTRFS_EXTENT_DATA_REF_KEY;
- key.offset = hash_extent_data_ref(root_objectid,
- owner, offset);
+ key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
size = sizeof(struct btrfs_extent_data_ref);
}
@@ -533,15 +529,15 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
goto fail;
leaf = path->nodes[0];
- if (parent) {
+ if (node->parent) {
struct btrfs_shared_data_ref *ref;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_shared_data_ref);
if (ret == 0) {
- btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+ btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
} else {
num_refs = btrfs_shared_data_ref_count(leaf, ref);
- num_refs += refs_to_add;
+ num_refs += node->ref_mod;
btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
}
} else {
@@ -549,7 +545,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
while (ret == -EEXIST) {
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
- if (match_extent_data_ref(leaf, ref, root_objectid,
+ if (match_extent_data_ref(leaf, ref, node->ref_root,
owner, offset))
break;
btrfs_release_path(path);
@@ -564,14 +560,13 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
if (ret == 0) {
- btrfs_set_extent_data_ref_root(leaf, ref,
- root_objectid);
+ btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
btrfs_set_extent_data_ref_offset(leaf, ref, offset);
- btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+ btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
} else {
num_refs = btrfs_extent_data_ref_count(leaf, ref);
- num_refs += refs_to_add;
+ num_refs += node->ref_mod;
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
@@ -695,20 +690,20 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- u64 bytenr, u64 parent,
- u64 root_objectid)
+ struct btrfs_delayed_ref_node *node,
+ u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
key.objectid = bytenr;
- if (parent) {
+ if (node->parent) {
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
- key.offset = parent;
+ key.offset = node->parent;
} else {
key.type = BTRFS_TREE_BLOCK_REF_KEY;
- key.offset = root_objectid;
+ key.offset = node->ref_root;
}
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -789,7 +784,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
int type;
int want;
int ret;
- int err = 0;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
int needed;
@@ -816,10 +810,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
/*
* We may be a newly converted file system which still has the old fat
@@ -846,7 +838,7 @@ again:
}
if (ret && !insert) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out;
} else if (WARN_ON(ret)) {
btrfs_print_leaf(path->nodes[0]);
@@ -854,18 +846,18 @@ again:
"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
bytenr, num_bytes, parent, root_objectid, owner,
offset);
- err = -EIO;
+ ret = -EUCLEAN;
goto out;
}
leaf = path->nodes[0];
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %llu expect >= %zu",
item_size, sizeof(*ei));
- btrfs_abort_transaction(trans, err);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -885,22 +877,17 @@ again:
else
needed = BTRFS_REF_TYPE_BLOCK;
- err = -ENOENT;
- while (1) {
- if (ptr >= end) {
- if (ptr > end) {
- err = -EUCLEAN;
- btrfs_print_leaf(path->nodes[0]);
- btrfs_crit(fs_info,
-"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
- path->slots[0], root_objectid, owner, offset, parent);
- }
- break;
- }
+ ret = -ENOENT;
+ while (ptr < end) {
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ ptr += btrfs_extent_inline_ref_size(type);
+ continue;
+ }
if (type == BTRFS_REF_TYPE_INVALID) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
goto out;
}
@@ -916,7 +903,7 @@ again:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (match_extent_data_ref(leaf, dref, root_objectid,
owner, offset)) {
- err = 0;
+ ret = 0;
break;
}
if (hash_extent_data_ref_item(leaf, dref) <
@@ -927,14 +914,14 @@ again:
ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
if (parent > 0) {
if (parent == ref_offset) {
- err = 0;
+ ret = 0;
break;
}
if (ref_offset < parent)
break;
} else {
if (root_objectid == ref_offset) {
- err = 0;
+ ret = 0;
break;
}
if (ref_offset < root_objectid)
@@ -943,10 +930,20 @@ again:
}
ptr += btrfs_extent_inline_ref_size(type);
}
- if (err == -ENOENT && insert) {
+
+ if (unlikely(ptr > end)) {
+ ret = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ goto out;
+ }
+
+ if (ret == -ENOENT && insert) {
if (item_size + extra_size >=
BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
@@ -958,7 +955,7 @@ again:
if (find_next_key(path, 0, &key) == 0 &&
key.objectid == bytenr &&
key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
}
@@ -969,7 +966,7 @@ out:
path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
- return err;
+ return ret;
}
/*
@@ -1443,7 +1440,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -1456,7 +1453,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
}
/*
- * __btrfs_inc_extent_ref - insert backreference for a given extent
+ * Insert backreference for a given extent.
*
* The counterpart is in __btrfs_free_extent(), with examples and more details
* how it works.
@@ -1466,36 +1463,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* @node: The delayed ref node used to get the bytenr/length for
* extent whose references are incremented.
*
- * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
- * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
- * bytenr of the parent block. Since new extents are always
- * created with indirect references, this will only be the case
- * when relocating a shared extent. In that case, root_objectid
- * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
- * be 0
- *
- * @root_objectid: The id of the root where this modification has originated,
- * this can be either one of the well-known metadata trees or
- * the subvolume id which references this extent.
- *
- * @owner: For data extents it is the inode number of the owning file.
- * For metadata extents this parameter holds the level in the
- * tree of the extent.
- *
- * @offset: For metadata extents the offset is ignored and is currently
- * always passed as 0. For data extents it is the fileoffset
- * this extent belongs to.
- *
- * @refs_to_add Number of references to add
- *
* @extent_op Pointer to a structure, holding information necessary when
* updating a tree block's flags
*
*/
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
- u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_path *path;
@@ -1504,7 +1477,10 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_key key;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
u64 refs;
+ int refs_to_add = node->ref_mod;
int ret;
path = btrfs_alloc_path();
@@ -1513,7 +1489,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
- parent, root_objectid, owner,
+ node->parent, node->ref_root, owner,
offset, refs_to_add, extent_op);
if ((ret < 0 && ret != -EAGAIN) || !ret)
goto out;
@@ -1536,12 +1512,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/* now insert the actual backref */
if (owner < BTRFS_FIRST_FREE_OBJECTID)
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
- root_objectid);
+ ret = insert_tree_block_ref(trans, path, node, bytenr);
else
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
- root_objectid, owner, offset,
- refs_to_add);
+ ret = insert_extent_data_ref(trans, path, node, bytenr);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -1550,45 +1523,68 @@ out:
return ret;
}
+static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *href)
+{
+ u64 root = href->owning_root;
+
+ /*
+ * Don't check must_insert_reserved, as this is called from contexts
+ * where it has already been unset.
+ */
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
+ !href->is_data || !is_fstree(root))
+ return;
+
+ btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
+ BTRFS_QGROUP_RSV_DATA);
+}
+
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
int ret = 0;
- struct btrfs_delayed_data_ref *ref;
- struct btrfs_key ins;
u64 parent = 0;
- u64 ref_root = 0;
u64 flags = 0;
- ins.objectid = node->bytenr;
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
- ref = btrfs_delayed_node_to_data_ref(node);
- trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
+ trace_run_delayed_data_ref(trans->fs_info, node);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
- parent = ref->parent;
- ref_root = ref->root;
+ parent = node->parent;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ struct btrfs_key key;
+ struct btrfs_squota_delta delta = {
+ .root = href->owning_root,
+ .num_bytes = node->num_bytes,
+ .is_data = true,
+ .is_inc = true,
+ .generation = trans->transid,
+ };
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
+
if (extent_op)
flags |= extent_op->flags_to_set;
- ret = alloc_reserved_file_extent(trans, parent, ref_root,
- flags, ref->objectid,
- ref->offset, &ins,
- node->ref_mod);
+
+ key.objectid = node->bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = node->num_bytes;
+
+ ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
+ flags, owner, offset, &key,
+ node->ref_mod,
+ href->owning_root);
+ free_head_ref_squota_rsv(trans->fs_info, href);
+ if (!ret)
+ ret = btrfs_record_squota_delta(trans->fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
- ref->objectid, ref->offset,
- node->ref_mod, extent_op);
+ ret = __btrfs_inc_extent_ref(trans, node, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, node, parent,
- ref_root, ref->objectid,
- ref->offset, node->ref_mod,
- extent_op);
+ ret = __btrfs_free_extent(trans, href, node, extent_op);
} else {
BUG();
}
@@ -1625,7 +1621,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
u32 item_size;
int ret;
- int err = 0;
int metadata = 1;
if (TRANS_ABORTED(trans))
@@ -1642,7 +1637,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
if (metadata) {
key.type = BTRFS_METADATA_ITEM_KEY;
- key.offset = extent_op->level;
+ key.offset = head->level;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = head->num_bytes;
@@ -1652,10 +1647,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- err = ret;
goto out;
- }
- if (ret > 0) {
+ } else if (ret > 0) {
if (metadata) {
if (path->slots[0] > 0) {
path->slots[0]--;
@@ -1676,10 +1669,10 @@ again:
goto again;
}
} else {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
- head->bytenr, head->num_bytes, extent_op->level);
+ head->bytenr, head->num_bytes, head->level);
goto out;
}
}
@@ -1688,11 +1681,11 @@ again:
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- btrfs_abort_transaction(trans, err);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1702,25 +1695,25 @@ again:
btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
int ret = 0;
- struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
u64 parent = 0;
u64 ref_root = 0;
- ref = btrfs_delayed_node_to_tree_ref(node);
- trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
+ trace_run_delayed_tree_ref(trans->fs_info, node);
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
- parent = ref->parent;
- ref_root = ref->root;
+ parent = node->parent;
+ ref_root = node->ref_root;
if (unlikely(node->ref_mod != 1)) {
btrfs_err(trans->fs_info,
@@ -1730,14 +1723,21 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
return -EUCLEAN;
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
- BUG_ON(!extent_op || !extent_op->update_flags);
+ struct btrfs_squota_delta delta = {
+ .root = href->owning_root,
+ .num_bytes = fs_info->nodesize,
+ .is_data = false,
+ .is_inc = true,
+ .generation = trans->transid,
+ };
+
ret = alloc_reserved_tree_block(trans, node, extent_op);
+ if (!ret)
+ btrfs_record_squota_delta(fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ret = __btrfs_inc_extent_ref(trans, node, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, node, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ret = __btrfs_free_extent(trans, href, node, extent_op);
} else {
BUG();
}
@@ -1746,6 +1746,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
@@ -1753,19 +1754,23 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
int ret = 0;
if (TRANS_ABORTED(trans)) {
- if (insert_reserved)
+ if (insert_reserved) {
btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ free_head_ref_squota_rsv(trans->fs_info, href);
+ }
return 0;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
- ret = run_delayed_tree_ref(trans, node, extent_op,
+ ret = run_delayed_tree_ref(trans, href, node, extent_op,
insert_reserved);
else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
- ret = run_delayed_data_ref(trans, node, extent_op,
+ ret = run_delayed_data_ref(trans, href, node, extent_op,
insert_reserved);
+ else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
+ ret = 0;
else
BUG();
if (ret && insert_reserved)
@@ -1844,28 +1849,38 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
return ret ? ret : 1;
}
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- int nr_items = 1; /* Dropping this ref head update. */
+ u64 ret = 0;
/*
* We had csum deletions accounted for in our delayed refs rsv, we need
* to drop the csum leaves for this update from our delayed_refs_rsv.
*/
if (head->total_ref_mod < 0 && head->is_data) {
+ int nr_csums;
+
spin_lock(&delayed_refs->lock);
delayed_refs->pending_csums -= head->num_bytes;
spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+
+ btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
+
+ ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
}
+ /* must_insert_reserved can be set only if we didn't run the head ref. */
+ if (head->must_insert_reserved)
+ free_head_ref_squota_rsv(fs_info, head);
- btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+ return ret;
}
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head)
+ struct btrfs_delayed_ref_head *head,
+ u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1910,7 +1925,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
@@ -1952,7 +1967,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
}
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *locked_ref)
+ struct btrfs_delayed_ref_head *locked_ref,
+ u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -2000,14 +2016,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
+ /*
+ * Unsetting this on the head ref relinquishes ownership of
+ * the rsv_bytes, so it is critical that every possible code
+ * path from here forward frees all reserves including qgroup
+ * reserve.
+ */
locked_ref->must_insert_reserved = false;
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
spin_unlock(&locked_ref->lock);
- ret = run_one_delayed_ref(trans, ref, extent_op,
+ ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
must_insert_reserved);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
+ *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
@@ -2031,15 +2055,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long nr)
+ u64 min_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *locked_ref = NULL;
int ret;
unsigned long count = 0;
+ unsigned long max_count = 0;
+ u64 bytes_processed = 0;
delayed_refs = &trans->transaction->delayed_refs;
+ if (min_bytes == 0) {
+ max_count = delayed_refs->num_heads_ready;
+ min_bytes = U64_MAX;
+ }
+
do {
if (!locked_ref) {
locked_ref = btrfs_obtain_ref_head(trans);
@@ -2067,7 +2098,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
- ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
@@ -2079,7 +2110,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* Success, perform the usual cleanup of a processed
* head
*/
- ret = cleanup_ref_head(trans, locked_ref);
+ ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
if (ret > 0 ) {
/* We dropped our lock, we need to loop. */
ret = 0;
@@ -2096,7 +2127,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
locked_ref = NULL;
cond_resched();
- } while ((nr != -1 && count < nr) || locked_ref);
+ } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
+ (max_count > 0 && count < max_count) ||
+ locked_ref);
return 0;
}
@@ -2145,24 +2178,25 @@ static u64 find_middle(struct rb_root *root)
#endif
/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far. count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
+ * Start processing the delayed reference count updates and extent insertions
+ * we have queued up so far.
+ *
+ * @trans: Transaction handle.
+ * @min_bytes: How many bytes of delayed references to process. After this
+ * many bytes we stop processing delayed references if there are
+ * any more. If 0 it means to run all existing delayed references,
+ * but not new ones added after running all existing ones.
+ * Use (u64)-1 (U64_MAX) to run all existing delayed references
+ * plus any new ones that are added.
*
* Returns 0 on success or if called with an aborted transaction
* Returns <0 on error and aborts the transaction
*/
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long count)
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_head *head;
int ret;
- int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
if (TRANS_ABORTED(trans))
@@ -2172,42 +2206,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
return 0;
delayed_refs = &trans->transaction->delayed_refs;
- if (count == 0)
- count = delayed_refs->num_heads_ready;
-
again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- ret = __btrfs_run_delayed_refs(trans, count);
+ ret = __btrfs_run_delayed_refs(trans, min_bytes);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
}
- if (run_all) {
+ if (min_bytes == U64_MAX) {
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- node = rb_first_cached(&delayed_refs->href_root);
- if (!node) {
+ if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
spin_unlock(&delayed_refs->lock);
- goto out;
+ return 0;
}
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
- /* Mutex was contended, block until it's released and retry. */
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
-
- btrfs_put_delayed_ref_head(head);
cond_resched();
goto again;
}
-out:
+
return 0;
}
@@ -2215,7 +2237,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags)
{
struct btrfs_delayed_extent_op *extent_op;
- int level = btrfs_header_level(eb);
int ret;
extent_op = btrfs_alloc_delayed_extent_op();
@@ -2225,9 +2246,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
- extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
+ btrfs_header_level(eb), extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -2239,7 +2260,6 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_transaction *cur_trans;
struct rb_node *node;
@@ -2293,6 +2313,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
*/
for (node = rb_first_cached(&head->ref_tree); node;
node = rb_next(node)) {
+ u64 ref_owner;
+ u64 ref_offset;
+
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -2300,15 +2323,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
break;
}
- data_ref = btrfs_delayed_node_to_data_ref(ref);
+ ref_owner = btrfs_delayed_ref_owner(ref);
+ ref_offset = btrfs_delayed_ref_offset(ref);
/*
* If our ref doesn't match the one we're currently looking at
* then we have a cross reference.
*/
- if (data_ref->root != root->root_key.objectid ||
- data_ref->objectid != objectid ||
- data_ref->offset != offset) {
+ if (ref->ref_root != btrfs_root_id(root) ||
+ ref_owner != objectid || ref_offset != offset) {
ret = 1;
break;
}
@@ -2332,6 +2355,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_extent_item *ei;
struct btrfs_key key;
u32 item_size;
+ u32 expected_size;
int type;
int ret;
@@ -2342,7 +2366,14 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = -ENOENT;
if (path->slots[0] == 0)
@@ -2358,10 +2389,22 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ret = 1;
item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
+
+ /* No inline refs; we need to bail before checking for owner ref. */
+ if (item_size == sizeof(*ei))
+ goto out;
+
+ /* Check for an owner ref; skip over it to the real inline refs. */
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+ if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+ iref = (struct btrfs_extent_inline_ref *)(iref + 1);
+ }
/* If extent item has more than 1 inline ref then it's shared */
- if (item_size != sizeof(*ei) +
- btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+ if (item_size != expected_size)
goto out;
/*
@@ -2373,8 +2416,6 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_root_last_snapshot(&root->root_item)))
goto out;
- iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-
/* If this extent has SHARED_DATA_REF then it's shared */
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
@@ -2383,8 +2424,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (btrfs_extent_refs(leaf, ei) !=
btrfs_extent_data_ref_count(leaf, ref) ||
- btrfs_extent_data_ref_root(leaf, ref) !=
- root->root_key.objectid ||
+ btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
goto out;
@@ -2421,14 +2461,11 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int full_backref, int inc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 bytenr;
- u64 num_bytes;
u64 parent;
u64 ref_root;
u32 nritems;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
- struct btrfs_ref generic_ref = { 0 };
bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
int i;
int action;
@@ -2455,6 +2492,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
action = BTRFS_DROP_DELAYED_REF;
for (i = 0; i < nritems; i++) {
+ struct btrfs_ref ref = {
+ .action = action,
+ .parent = parent,
+ .ref_root = ref_root,
+ };
+
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
@@ -2464,34 +2507,33 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
if (btrfs_file_extent_type(buf, fi) ==
BTRFS_FILE_EXTENT_INLINE)
continue;
- bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
- if (bytenr == 0)
+ ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (ref.bytenr == 0)
continue;
- num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ ref.owning_root = ref_root;
+
key.offset -= btrfs_file_extent_offset(buf, fi);
- btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent);
- btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset, root->root_key.objectid,
- for_reloc);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), for_reloc);
if (inc)
- ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ ret = btrfs_inc_extent_ref(trans, &ref);
else
- ret = btrfs_free_extent(trans, &generic_ref);
+ ret = btrfs_free_extent(trans, &ref);
if (ret)
goto fail;
} else {
- bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = fs_info->nodesize;
- btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent);
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
- root->root_key.objectid, for_reloc);
+ /* We don't know the owning_root, leave as 0. */
+ ref.bytenr = btrfs_node_blockptr(buf, i);
+ ref.num_bytes = fs_info->nodesize;
+
+ btrfs_init_tree_ref(&ref, level - 1,
+ btrfs_root_id(root), for_reloc);
if (inc)
- ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ ret = btrfs_inc_extent_ref(trans, &ref);
else
- ret = btrfs_free_extent(trans, &generic_ref);
+ ret = btrfs_free_extent(trans, &ref);
if (ret)
goto fail;
}
@@ -2586,16 +2628,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * this function must be called within transaction
- */
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes)
+ const struct extent_buffer *eb)
{
struct btrfs_block_group *cache;
int ret;
- cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
if (!cache)
return -EINVAL;
@@ -2607,10 +2646,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- pin_down_extent(trans, cache, bytenr, num_bytes, 0);
+ pin_down_extent(trans, cache, eb->start, eb->len, 0);
/* remove us from the free space cache (if we're there at all) */
- ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+ ret = btrfs_remove_free_space(cache, eb->start, eb->len);
out:
btrfs_put_block_group(cache);
return ret;
@@ -2715,6 +2754,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 total_unpinned = 0;
u64 empty_cluster = 0;
bool readonly;
+ int ret = 0;
while (start <= end) {
readonly = false;
@@ -2724,7 +2764,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(cache);
total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
- BUG_ON(!cache); /* Logic error */
+ if (cache == NULL) {
+ /* Logic error, something removed the block group. */
+ ret = -EUCLEAN;
+ goto out;
+ }
cluster = fetch_cluster_info(fs_info,
cache->space_info,
@@ -2794,7 +2838,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
if (cache)
btrfs_put_block_group(cache);
- return 0;
+out:
+ return ret;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
@@ -2824,7 +2869,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, &cached_state);
- unpin_extent_range(fs_info, start, end, true);
+ ret = unpin_extent_range(fs_info, start, end, true);
+ BUG_ON(ret);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
free_extent_state(cached_state);
cond_resched();
@@ -2851,7 +2897,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
block_group->length,
&trimmed);
+ /*
+ * Not strictly necessary to lock, as the block_group should be
+ * read-only from btrfs_delete_unused_bgs().
+ */
+ ASSERT(block_group->ro);
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -2866,12 +2920,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Parse an extent item's inline extents looking for a simple quotas owner ref.
+ *
+ * @fs_info: the btrfs_fs_info for this mount
+ * @leaf: a leaf in the extent tree containing the extent item
+ * @slot: the slot in the leaf where the extent item is found
+ *
+ * Returns the objectid of the root that originally allocated the extent item
+ * if the inline owner ref is expected and present, otherwise 0.
+ *
+ * If an extent item has an owner ref item, it will be the first inline ref
+ * item. Therefore the logic is to check whether there are any inline ref
+ * items, then check the type of the first one.
+ */
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_owner_ref *oref;
+ unsigned long ptr;
+ unsigned long end;
+ int type;
+
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
+ return 0;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + btrfs_item_size(leaf, slot);
+
+ /* No inline ref items of any kind, can't check type. */
+ if (ptr == end)
+ return 0;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
+
+ /* We found an owner ref, get the root out of it. */
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ return btrfs_extent_owner_ref_root_id(leaf, oref);
+ }
+
+ /* We have inline refs, but not an owner ref. */
+ return 0;
+}
+
static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, bool is_data)
+ u64 bytenr, struct btrfs_squota_delta *delta)
{
int ret;
+ u64 num_bytes = delta->num_bytes;
- if (is_data) {
+ if (delta->is_data) {
struct btrfs_root *csum_root;
csum_root = btrfs_csum_root(trans->fs_info, bytenr);
@@ -2880,6 +2983,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
+
+ ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = btrfs_record_squota_delta(trans->fs_info, delta);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
ret = add_to_free_space_tree(trans, bytenr, num_bytes);
@@ -2962,9 +3077,8 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
* And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node, u64 parent,
- u64 root_objectid, u64 owner_objectid,
- u64 owner_offset, int refs_to_drop,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -2979,11 +3093,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
+ int refs_to_drop = node->ref_mod;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
+ u64 owner_objectid = btrfs_delayed_ref_owner(node);
+ u64 owner_offset = btrfs_delayed_ref_offset(node);
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+ u64 delayed_ref_root = href->owning_root;
extent_root = btrfs_extent_root(info, bytenr);
ASSERT(extent_root);
@@ -3007,7 +3125,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
skinny_metadata = false;
ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
- parent, root_objectid, owner_objectid,
+ node->parent, node->ref_root, owner_objectid,
owner_offset);
if (ret == 0) {
/*
@@ -3109,7 +3227,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
} else if (WARN_ON(ret == -ENOENT)) {
abort_and_dump(trans, path,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
- bytenr, parent, root_objectid, owner_objectid,
+ bytenr, node->parent, node->ref_root, owner_objectid,
owner_offset, path->slots[0]);
goto out;
} else {
@@ -3184,6 +3302,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ struct btrfs_squota_delta delta = {
+ .root = delayed_ref_root,
+ .num_bytes = num_bytes,
+ .is_data = is_data,
+ .is_inc = false,
+ .generation = btrfs_extent_generation(leaf, ei),
+ };
+
/* In this branch refs == 1 */
if (found_extent) {
if (is_data && refs_to_drop !=
@@ -3222,6 +3348,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
num_to_del = 2;
}
}
+ /*
+ * We can't infer the data owner from the delayed ref, so we need
+ * to try to get it from the owning ref item.
+ *
+ * If it is not present, then that extent was not written under
+ * simple quotas mode, so we don't need to account for its deletion.
+ */
+ if (is_data)
+ delta.root = btrfs_get_extent_owner_root(trans->fs_info,
+ leaf, extent_slot);
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
@@ -3231,7 +3367,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
+ ret = do_free_extent_accounting(trans, bytenr, &delta);
}
btrfs_release_path(path);
@@ -3301,82 +3437,92 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 parent, int last_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_ref generic_ref = { 0 };
+ struct btrfs_block_group *bg;
int ret;
- btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
- buf->start, buf->len, parent);
- btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root_id, 0, false);
-
if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = buf->start,
+ .num_bytes = buf->len,
+ .parent = parent,
+ .owning_root = btrfs_header_owner(buf),
+ .ref_root = root_id,
+ };
+
+ /*
+ * Assert that the extent buffer is not cleared due to
+ * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer
+ * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
+ * detail.
+ */
+ ASSERT(btrfs_header_bytenr(buf) != 0);
+
+ btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
if (ret < 0)
return ret;
}
- if (last_ref && btrfs_header_generation(buf) == trans->transid) {
- struct btrfs_block_group *cache;
- bool must_pin = false;
-
- if (root_id != BTRFS_TREE_LOG_OBJECTID) {
- ret = check_ref_cleanup(trans, buf->start);
- if (!ret) {
- btrfs_redirty_list_add(trans->transaction, buf);
- goto out;
- }
- }
+ if (!last_ref)
+ return 0;
- cache = btrfs_lookup_block_group(fs_info, buf->start);
+ if (btrfs_header_generation(buf) != trans->transid)
+ goto out;
- if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, buf->start);
+ if (!ret)
goto out;
- }
+ }
- /*
- * If there are tree mod log users we may have recorded mod log
- * operations for this node. If we re-allocate this node we
- * could replay operations on this node that happened when it
- * existed in a completely different root. For example if it
- * was part of root A, then was reallocated to root B, and we
- * are doing a btrfs_old_search_slot(root b), we could replay
- * operations that happened when the block was part of root A,
- * giving us an inconsistent view of the btree.
- *
- * We are safe from races here because at this point no other
- * node or root points to this extent buffer, so if after this
- * check a new tree mod log user joins we will not have an
- * existing log of operations on this node that we have to
- * contend with.
- */
- if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
- must_pin = true;
+ bg = btrfs_lookup_block_group(fs_info, buf->start);
- if (must_pin || btrfs_is_zoned(fs_info)) {
- btrfs_redirty_list_add(trans->transaction, buf);
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
- goto out;
- }
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
+ }
- WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+ /*
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
+ * We are safe from races here because at this point no other
+ * node or root points to this extent buffer, so if after this
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
+ */
- btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_free_reserved_bytes(cache, buf->len, 0);
- btrfs_put_block_group(cache);
- trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
+ || btrfs_is_zoned(fs_info)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
}
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(bg, buf->start, buf->len);
+ btrfs_free_reserved_bytes(bg, buf->len, 0);
+ btrfs_put_block_group(bg);
+ trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+
out:
- if (last_ref) {
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't
- * matter anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
- }
+
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't
+ * matter anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
return 0;
}
@@ -3393,12 +3539,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree log blocks never actually go into the extent allocation
* tree, just update pinning info and exit early.
*/
- if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
- (ref->type == BTRFS_REF_DATA &&
- ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
- /* unlocks the pinned mutex */
- btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
+ if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
+ btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
@@ -3406,10 +3548,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
- if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
- (ref->type == BTRFS_REF_DATA &&
- ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
+ if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -3515,6 +3654,21 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
+static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl,
+ const struct btrfs_block_group *bg)
+{
+ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
+ return true;
+ if (!btrfs_block_group_should_use_size_class(bg))
+ return true;
+ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
+ return true;
+ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
+ bg->size_class == BTRFS_BG_SZ_NONE)
+ return true;
+ return ffe_ctl->size_class == bg->size_class;
+}
+
/*
* Helper function for find_free_extent().
*
@@ -3536,7 +3690,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
if (!cluster_bg)
goto refill_cluster;
if (cluster_bg != bg && (cluster_bg->ro ||
- !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ !block_group_bits(cluster_bg, ffe_ctl->flags) ||
+ !find_free_extent_check_size_class(ffe_ctl, cluster_bg)))
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
@@ -4092,21 +4247,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
-static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group *bg)
-{
- if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
- return true;
- if (!btrfs_block_group_should_use_size_class(bg))
- return true;
- if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
- return true;
- if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
- bg->size_class == BTRFS_BG_SZ_NONE)
- return true;
- return ffe_ctl->size_class == bg->size_class;
-}
-
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4490,8 +4630,8 @@ loop:
}
/*
- * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
- * hole that is at least as big as @num_bytes.
+ * Entry point to the extent allocator. Tries to find a hole that is at least
+ * as big as @num_bytes.
*
* @root - The root that will contain this extent
*
@@ -4544,7 +4684,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
- bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
@@ -4610,20 +4750,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
- u64 len)
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+ const struct extent_buffer *eb)
{
struct btrfs_block_group *cache;
int ret = 0;
- cache = btrfs_lookup_block_group(trans->fs_info, start);
+ cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
if (!cache) {
btrfs_err(trans->fs_info, "unable to find block group for %llu",
- start);
+ eb->start);
return -ENOSPC;
}
- ret = pin_down_extent(trans, cache, start, len, 1);
+ ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
btrfs_put_block_group(cache);
return ret;
}
@@ -4653,24 +4793,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
- struct btrfs_key *ins, int ref_mod)
+ struct btrfs_key *ins, int ref_mod, u64 oref_root)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
+ struct btrfs_extent_owner_ref *oref;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
int type;
u32 size;
+ const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
if (parent > 0)
type = BTRFS_SHARED_DATA_REF_KEY;
else
type = BTRFS_EXTENT_DATA_REF_KEY;
- size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+ size = sizeof(*extent_item);
+ if (simple_quota)
+ size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+ size += btrfs_extent_inline_ref_size(type);
path = btrfs_alloc_path();
if (!path)
@@ -4692,7 +4837,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
flags | BTRFS_EXTENT_FLAG_DATA);
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ if (simple_quota) {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
+ iref = (struct btrfs_extent_inline_ref *)(oref + 1);
+ }
btrfs_set_extent_inline_ref_type(leaf, iref, type);
+
if (parent > 0) {
struct btrfs_shared_data_ref *ref;
ref = (struct btrfs_shared_data_ref *)(iref + 1);
@@ -4726,16 +4878,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
- struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 flags = extent_op->flags_to_set;
+ const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
+ /* The owner of a tree block is the level. */
+ int level = btrfs_delayed_ref_owner(node);
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
- ref = btrfs_delayed_node_to_tree_ref(node);
-
extent_key.objectid = node->bytenr;
if (skinny_metadata) {
- extent_key.offset = ref->level;
+ /* The owner of a tree block is the level. */
+ extent_key.offset = level;
extent_key.type = BTRFS_METADATA_ITEM_KEY;
} else {
extent_key.offset = node->num_bytes;
@@ -4768,18 +4920,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
- btrfs_set_tree_block_level(leaf, block_info, ref->level);
+ btrfs_set_tree_block_level(leaf, block_info, level);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
}
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
} else {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_TREE_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
}
btrfs_mark_buffer_dirty(trans, leaf);
@@ -4793,14 +4945,20 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins)
{
- struct btrfs_ref generic_ref = { 0 };
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_ADD_DELAYED_EXTENT,
+ .bytenr = ins->objectid,
+ .num_bytes = ins->offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+
+ ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
- BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+ generic_ref.owning_root = root->relocation_src_root;
- btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
- offset, 0, false);
+ btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
@@ -4819,6 +4977,13 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
+ struct btrfs_squota_delta delta = {
+ .root = root_objectid,
+ .num_bytes = ins->offset,
+ .generation = trans->transid,
+ .is_data = true,
+ .is_inc = true,
+ };
/*
* Mixed block groups will exclude before processing the log so we only
@@ -4844,13 +5009,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
spin_unlock(&space_info->lock);
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
- offset, ins, 1);
+ offset, ins, 1, root_objectid);
if (ret)
btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
+ ret = btrfs_record_squota_delta(fs_info, &delta);
btrfs_put_block_group(block_group);
return ret;
}
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extra safety check in case the extent tree is corrupted and extent allocator
+ * chooses to use a tree block which is already used and locked.
+ */
+static bool check_eb_lock_owner(const struct extent_buffer *eb)
+{
+ if (eb->lock_owner == current->pid) {
+ btrfs_err_rl(eb->fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ eb->start, btrfs_header_owner(eb), current->pid);
+ return true;
+ }
+ return false;
+}
+#else
+static bool check_eb_lock_owner(struct extent_buffer *eb)
+{
+ return false;
+}
+#endif
+
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, int level, u64 owner,
@@ -4864,15 +5052,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
- /*
- * Extra safety check in case the extent tree is corrupted and extent
- * allocator chooses to use a tree block which is already used and
- * locked.
- */
- if (buf->lock_owner == current->pid) {
- btrfs_err_rl(fs_info,
-"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
- buf->start, btrfs_header_owner(buf), current->pid);
+ if (check_eb_lock_owner(buf)) {
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -4901,10 +5081,10 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
*/
btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
- __btrfs_tree_lock(buf, nest);
+ btrfs_tree_lock_nested(buf, nest);
btrfs_clear_buffer_dirty(trans, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
set_extent_buffer_uptodate(buf);
@@ -4916,7 +5096,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_owner(buf, owner);
write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
@@ -4949,18 +5129,18 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
+ u64 reloc_src_root,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
- struct btrfs_delayed_extent_op *extent_op;
- struct btrfs_ref generic_ref = { 0 };
u64 flags = 0;
int ret;
u32 blocksize = fs_info->nodesize;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+ u64 owning_root;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
@@ -4987,42 +5167,54 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
ret = PTR_ERR(buf);
goto out_free_reserved;
}
+ owning_root = btrfs_header_owner(buf);
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
parent = ins.objectid;
flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ owning_root = reloc_src_root;
} else
BUG_ON(parent > 0);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- extent_op = btrfs_alloc_delayed_extent_op();
- if (!extent_op) {
- ret = -ENOMEM;
- goto out_free_buf;
+ struct btrfs_delayed_extent_op *extent_op;
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_ADD_DELAYED_EXTENT,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .parent = parent,
+ .owning_root = owning_root,
+ .ref_root = root_objectid,
+ };
+
+ if (!skinny_metadata || flags != 0) {
+ extent_op = btrfs_alloc_delayed_extent_op();
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = (skinny_metadata ? false : true);
+ extent_op->update_flags = (flags != 0);
+ } else {
+ extent_op = NULL;
}
- if (key)
- memcpy(&extent_op->key, key, sizeof(extent_op->key));
- else
- memset(&extent_op->key, 0, sizeof(extent_op->key));
- extent_op->flags_to_set = flags;
- extent_op->update_key = skinny_metadata ? false : true;
- extent_op->update_flags = true;
- extent_op->level = level;
-
- btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins.objectid, ins.offset, parent);
- btrfs_init_tree_ref(&generic_ref, level, root_objectid,
- root->root_key.objectid, false);
+
+ btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
- if (ret)
- goto out_free_delayed;
+ if (ret) {
+ btrfs_free_delayed_extent_op(extent_op);
+ goto out_free_buf;
+ }
}
return buf;
-out_free_delayed:
- btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
btrfs_tree_unlock(buf);
free_extent_buffer(buf);
@@ -5047,11 +5239,99 @@ struct walk_control {
int reada_slot;
int reada_count;
int restarted;
+ /* Indicate that extent info needs to be looked up when walking the tree. */
+ int lookup_info;
};
+/*
+ * This is our normal stage. We are traversing blocks the current snapshot owns
+ * and we are dropping any of our references to any children we are able to, and
+ * then freeing the block once we've processed all of the children.
+ */
#define DROP_REFERENCE 1
+
+/*
+ * We enter this stage when we have to walk into a child block (meaning we can't
+ * simply drop our reference to it from our current parent node) and there are
+ * more than one reference on it. If we are the owner of any of the children
+ * blocks from the current parent node then we have to do the FULL_BACKREF dance
+ * on them in order to drop our normal ref and add the shared ref.
+ */
#define UPDATE_BACKREF 2
+/*
+ * Decide if we need to walk down into this node to adjust the references.
+ *
+ * @root: the root we are currently deleting
+ * @wc: the walk control for this deletion
+ * @eb: the parent eb that we're currently visiting
+ * @refs: the number of refs for wc->level - 1
+ * @flags: the flags for wc->level - 1
+ * @slot: the slot in the eb that we're currently checking
+ *
+ * This is meant to be called when we're evaluating if a node we point to at
+ * wc->level should be read and walked into, or if we can simply delete our
+ * reference to it. We return true if we should walk into the node, false if we
+ * can skip it.
+ *
+ * We have assertions in here to make sure this is called correctly. We assume
+ * that sanity checking on the blocks read to this point has been done, so any
+ * corrupted file systems must have been caught before calling this function.
+ */
+static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
+ struct extent_buffer *eb, u64 refs, u64 flags, int slot)
+{
+ struct btrfs_key key;
+ u64 generation;
+ int level = wc->level;
+
+ ASSERT(level > 0);
+ ASSERT(wc->refs[level - 1] > 0);
+
+ /*
+ * The update backref stage we only want to skip if we already have
+ * FULL_BACKREF set, otherwise we need to read.
+ */
+ if (wc->stage == UPDATE_BACKREF) {
+ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ return false;
+ return true;
+ }
+
+ /*
+ * We're the last ref on this block, we must walk into it and process
+ * any refs it's pointing at.
+ */
+ if (wc->refs[level - 1] == 1)
+ return true;
+
+ /*
+ * If we're already FULL_BACKREF then we know we can just drop our
+ * current reference.
+ */
+ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ return false;
+
+ /*
+ * This block is older than our creation generation, we can drop our
+ * reference to it.
+ */
+ generation = btrfs_node_ptr_generation(eb, slot);
+ if (!wc->update_ref || generation <= btrfs_root_origin_generation(root))
+ return false;
+
+ /*
+ * This block was processed from a previous snapshot deletion run, we
+ * can skip it.
+ */
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
+ return false;
+
+ /* All other cases we need to wander into the node. */
+ return true;
+}
+
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct walk_control *wc,
@@ -5063,7 +5343,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
u64 refs;
u64 flags;
u32 nritems;
- struct btrfs_key key;
struct extent_buffer *eb;
int ret;
int slot;
@@ -5093,13 +5372,13 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
goto reada;
if (wc->stage == UPDATE_BACKREF &&
- generation <= root->root_key.offset)
+ generation <= btrfs_root_origin_generation(root))
continue;
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
wc->level - 1, 1, &refs,
- &flags);
+ &flags, NULL);
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
@@ -5113,26 +5392,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
if (refs == 0)
continue;
- if (wc->stage == DROP_REFERENCE) {
- if (refs == 1)
- goto reada;
-
- if (wc->level == 1 &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- continue;
- if (!wc->update_ref ||
- generation <= root->root_key.offset)
- continue;
- btrfs_node_key_to_cpu(eb, &key, slot);
- ret = btrfs_comp_cpu_keys(&key,
- &wc->update_progress);
- if (ret < 0)
- continue;
- } else {
- if (wc->level == 1 &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- continue;
- }
+ /* If we don't need to visit this node don't reada. */
+ if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
+ continue;
reada:
btrfs_readahead_node_child(eb, slot);
nread++;
@@ -5151,7 +5413,7 @@ reada:
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc, int lookup_info)
+ struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int level = wc->level;
@@ -5159,22 +5421,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
int ret;
- if (wc->stage == UPDATE_BACKREF &&
- btrfs_header_owner(eb) != root->root_key.objectid)
+ if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
return 1;
/*
* when reference count of tree block is 1, it won't increase
* again. once full backref flag is set, we never clear it.
*/
- if (lookup_info &&
+ if (wc->lookup_info &&
((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
ASSERT(path->locks[level]);
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
if (ret)
return ret;
if (unlikely(wc->refs[level] == 0)) {
@@ -5199,11 +5461,20 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
if (!(wc->flags[level] & flag)) {
ASSERT(path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = btrfs_set_disk_extent_flags(trans, eb, flag);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
wc->flags[level] |= flag;
}
@@ -5226,23 +5497,188 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 parent,
int level)
{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *head;
struct btrfs_path *path;
struct btrfs_extent_inline_ref *iref;
int ret;
+ bool exists = false;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
+again:
ret = lookup_extent_backref(trans, path, &iref, bytenr,
root->fs_info->nodesize, parent,
- root->root_key.objectid, level, 0);
+ btrfs_root_id(root), level, 0);
+ if (ret != -ENOENT) {
+ /*
+ * If we get 0 then we found our reference, return 1, else
+ * return the error if it's not -ENOENT;
+ */
+ btrfs_free_path(path);
+ return (ret < 0 ) ? ret : 1;
+ }
+
+ /*
+ * We could have a delayed ref with this reference, so look it up while
+ * we're holding the path open to make sure we don't race with the
+ * delayed ref running.
+ */
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ if (!head)
+ goto out;
+ if (!mutex_trylock(&head->mutex)) {
+ /*
+ * We're contended, means that the delayed ref is running, get a
+ * reference and wait for the ref head to be complete and then
+ * try again.
+ */
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref_head(head);
+ goto again;
+ }
+
+ exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
+ mutex_unlock(&head->mutex);
+out:
+ spin_unlock(&delayed_refs->lock);
btrfs_free_path(path);
- if (ret == -ENOENT)
+ return exists ? 1 : 0;
+}
+
+/*
+ * We may not have an uptodate block, so if we are going to walk down into this
+ * block we need to drop the lock, read it off of the disk, re-lock it and
+ * return to continue dropping the snapshot.
+ */
+static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc,
+ struct extent_buffer *next)
+{
+ struct btrfs_tree_parent_check check = { 0 };
+ u64 generation;
+ int level = wc->level;
+ int ret;
+
+ btrfs_assert_tree_write_locked(next);
+
+ generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
+
+ if (btrfs_buffer_uptodate(next, generation, 0))
return 0;
- if (ret < 0)
+
+ check.level = level - 1;
+ check.transid = generation;
+ check.owner_root = btrfs_root_id(root);
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
+
+ btrfs_tree_unlock(next);
+ if (level == 1)
+ reada_walk_down(trans, root, wc, path);
+ ret = btrfs_read_extent_buffer(next, &check);
+ if (ret) {
+ free_extent_buffer(next);
return ret;
- return 1;
+ }
+ btrfs_tree_lock(next);
+ wc->lookup_info = 1;
+ return 0;
+}
+
+/*
+ * If we determine that we don't have to visit wc->level - 1 then we need to
+ * determine if we can drop our reference.
+ *
+ * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
+ *
+ * If we are DROP_REFERENCE this will figure out if we need to drop our current
+ * reference, skipping it if we dropped it from a previous incompleted drop, or
+ * dropping it if we still have a reference to it.
+ */
+static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, struct walk_control *wc,
+ struct extent_buffer *next, u64 owner_root)
+{
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = next->start,
+ .num_bytes = root->fs_info->nodesize,
+ .owning_root = owner_root,
+ .ref_root = btrfs_root_id(root),
+ };
+ int level = wc->level;
+ int ret;
+
+ /* We are UPDATE_BACKREF, we're not dropping anything. */
+ if (wc->stage == UPDATE_BACKREF)
+ return 0;
+
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ ref.parent = path->nodes[level]->start;
+ } else {
+ ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
+ if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
+ btrfs_err(root->fs_info, "mismatched block owner");
+ return -EIO;
+ }
+ }
+
+ /*
+ * If we had a drop_progress we need to verify the refs are set as
+ * expected. If we find our ref then we know that from here on out
+ * everything should be correct, and we can clear the
+ * ->restarted flag.
+ */
+ if (wc->restarted) {
+ ret = check_ref_exists(trans, root, next->start, ref.parent,
+ level - 1);
+ if (ret <= 0)
+ return ret;
+ ret = 0;
+ wc->restarted = 0;
+ }
+
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have already
+ * accounted them at merge time (replace_path), thus we could skip
+ * expensive subtree trace here.
+ */
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ wc->refs[level - 1] > 1) {
+ u64 generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+
+ ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
+ if (ret) {
+ btrfs_err_rl(root->fs_info,
+"error %d accounting shared subtree, quota is out of sync, rescan required",
+ ret);
+ }
+ }
+
+ /*
+ * We need to update the next key in our walk control so we can update
+ * the drop_progress key accordingly. We don't care if find_next_key
+ * doesn't find a key because that means we're at the end and are going
+ * to clean up now.
+ */
+ wc->drop_level = level;
+ find_next_key(path, level, &wc->drop_progress);
+
+ btrfs_init_tree_ref(&ref, level - 1, 0, false);
+ return btrfs_free_extent(trans, &ref);
}
/*
@@ -5261,20 +5697,15 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc, int *lookup_info)
+ struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
- u64 parent;
- struct btrfs_tree_parent_check check = { 0 };
- struct btrfs_key key;
- struct btrfs_ref ref = { 0 };
+ u64 owner_root = 0;
struct extent_buffer *next;
int level = wc->level;
- int reada = 0;
int ret = 0;
- bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
@@ -5284,33 +5715,24 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
* for the subtree
*/
if (wc->stage == UPDATE_BACKREF &&
- generation <= root->root_key.offset) {
- *lookup_info = 1;
+ generation <= btrfs_root_origin_generation(root)) {
+ wc->lookup_info = 1;
return 1;
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- check.level = level - 1;
- check.transid = generation;
- check.owner_root = root->root_key.objectid;
- check.has_first_key = true;
- btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
- path->slots[level]);
+ next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
+ level - 1);
+ if (IS_ERR(next))
+ return PTR_ERR(next);
- next = find_extent_buffer(fs_info, bytenr);
- if (!next) {
- next = btrfs_find_create_tree_block(fs_info, bytenr,
- root->root_key.objectid, level - 1);
- if (IS_ERR(next))
- return PTR_ERR(next);
- reada = 1;
- }
btrfs_tree_lock(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
- &wc->flags[level - 1]);
+ &wc->flags[level - 1],
+ &owner_root);
if (ret < 0)
goto out_unlock;
@@ -5320,53 +5742,27 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = -EUCLEAN;
goto out_unlock;
}
- *lookup_info = 0;
+ wc->lookup_info = 0;
- if (wc->stage == DROP_REFERENCE) {
- if (wc->refs[level - 1] > 1) {
- need_account = true;
- if (level == 1 &&
- (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- goto skip;
-
- if (!wc->update_ref ||
- generation <= root->root_key.offset)
- goto skip;
-
- btrfs_node_key_to_cpu(path->nodes[level], &key,
- path->slots[level]);
- ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
- if (ret < 0)
- goto skip;
+ /* If we don't have to walk into this node skip it. */
+ if (!visit_node_for_delete(root, wc, path->nodes[level],
+ wc->refs[level - 1], wc->flags[level - 1],
+ path->slots[level]))
+ goto skip;
- wc->stage = UPDATE_BACKREF;
- wc->shared_level = level - 1;
- }
- } else {
- if (level == 1 &&
- (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- goto skip;
+ /*
+ * We have to walk down into this node, and if we're currently at the
+ * DROP_REFERNCE stage and this block is shared then we need to switch
+ * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
+ */
+ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
}
- if (!btrfs_buffer_uptodate(next, generation, 0)) {
- btrfs_tree_unlock(next);
- free_extent_buffer(next);
- next = NULL;
- *lookup_info = 1;
- }
-
- if (!next) {
- if (reada && level == 1)
- reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, &check);
- if (IS_ERR(next)) {
- return PTR_ERR(next);
- } else if (!extent_buffer_uptodate(next)) {
- free_extent_buffer(next);
- return -EIO;
- }
- btrfs_tree_lock(next);
- }
+ ret = check_next_block_uptodate(trans, root, path, wc, next);
+ if (ret)
+ return ret;
level--;
ASSERT(level == btrfs_header_level(next));
@@ -5383,76 +5779,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
wc->reada_slot = 0;
return 0;
skip:
+ ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
+ if (ret)
+ goto out_unlock;
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
- if (wc->stage == DROP_REFERENCE) {
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- parent = path->nodes[level]->start;
- } else {
- ASSERT(root->root_key.objectid ==
- btrfs_header_owner(path->nodes[level]));
- if (root->root_key.objectid !=
- btrfs_header_owner(path->nodes[level])) {
- btrfs_err(root->fs_info,
- "mismatched block owner");
- ret = -EIO;
- goto out_unlock;
- }
- parent = 0;
- }
-
- /*
- * If we had a drop_progress we need to verify the refs are set
- * as expected. If we find our ref then we know that from here
- * on out everything should be correct, and we can clear the
- * ->restarted flag.
- */
- if (wc->restarted) {
- ret = check_ref_exists(trans, root, bytenr, parent,
- level - 1);
- if (ret < 0)
- goto out_unlock;
- if (ret == 0)
- goto no_delete;
- ret = 0;
- wc->restarted = 0;
- }
-
- /*
- * Reloc tree doesn't contribute to qgroup numbers, and we have
- * already accounted them at merge time (replace_path),
- * thus we could skip expensive subtree trace here.
- */
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- need_account) {
- ret = btrfs_qgroup_trace_subtree(trans, next,
- generation, level - 1);
- if (ret) {
- btrfs_err_rl(fs_info,
- "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
- ret);
- }
- }
-
- /*
- * We need to update the next key in our walk control so we can
- * update the drop_progress key accordingly. We don't care if
- * find_next_key doesn't find a key because that means we're at
- * the end and are going to clean up now.
- */
- wc->drop_level = level;
- find_next_key(path, level, &wc->drop_progress);
-
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- fs_info->nodesize, parent);
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
- 0, false);
- ret = btrfs_free_extent(trans, &ref);
- if (ret)
- goto out_unlock;
- }
-no_delete:
- *lookup_info = 1;
+ wc->lookup_info = 1;
ret = 1;
out_unlock:
@@ -5486,7 +5818,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
u64 parent = 0;
if (wc->stage == UPDATE_BACKREF) {
- BUG_ON(wc->shared_level < level);
+ ASSERT(wc->shared_level >= level);
if (level < wc->shared_level)
goto out;
@@ -5504,14 +5836,15 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
* count is one.
*/
if (!path->locks[level]) {
- BUG_ON(level == 0);
+ ASSERT(level > 0);
btrfs_tree_lock(eb);
path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5532,7 +5865,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
}
/* wc->stage == DROP_REFERENCE */
- BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+ ASSERT(path->locks[level] || wc->refs[level] == 1);
if (wc->refs[level] == 1) {
if (level == 0) {
@@ -5540,8 +5873,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 1);
else
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
- if (is_fstree(root->root_key.objectid)) {
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ if (is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
btrfs_err_rl(fs_info,
@@ -5561,12 +5897,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else if (root->root_key.objectid != btrfs_header_owner(eb))
+ else if (btrfs_root_id(root) != btrfs_header_owner(eb))
goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else if (root->root_key.objectid !=
+ else if (btrfs_root_id(root) !=
btrfs_header_owner(path->nodes[level + 1]))
goto owner_mismatch;
}
@@ -5582,21 +5918,42 @@ out:
owner_mismatch:
btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
- btrfs_header_owner(eb), root->root_key.objectid);
+ btrfs_header_owner(eb), btrfs_root_id(root));
return -EUCLEAN;
}
+/*
+ * walk_down_tree consists of two steps.
+ *
+ * walk_down_proc(). Look up the reference count and reference of our current
+ * wc->level. At this point path->nodes[wc->level] should be populated and
+ * uptodate, and in most cases should already be locked. If we are in
+ * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
+ * we can walk back up the tree. If we are UPDATE_BACKREF we have to set
+ * FULL_BACKREF on this node if it's not already set, and then do the
+ * FULL_BACKREF conversion dance, which is to drop the root reference and add
+ * the shared reference to all of this nodes children.
+ *
+ * do_walk_down(). This is where we actually start iterating on the children of
+ * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping
+ * our reference to the children that return false from visit_node_for_delete(),
+ * which has various conditions where we know we can just drop our reference
+ * without visiting the node. For UPDATE_BACKREF we will skip any children that
+ * visit_node_for_delete() returns false for, only walking down when necessary.
+ * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
+ * snapshot deletion.
+ */
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc)
{
int level = wc->level;
- int lookup_info = 1;
int ret = 0;
+ wc->lookup_info = 1;
while (level >= 0) {
- ret = walk_down_proc(trans, root, path, wc, lookup_info);
+ ret = walk_down_proc(trans, root, path, wc);
if (ret)
break;
@@ -5607,7 +5964,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
btrfs_header_nritems(path->nodes[level]))
break;
- ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ ret = do_walk_down(trans, root, path, wc);
if (ret > 0) {
path->slots[level]++;
continue;
@@ -5618,6 +5975,23 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
return (ret == 1) ? 0 : ret;
}
+/*
+ * walk_up_tree() is responsible for making sure we visit every slot on our
+ * current node, and if we're at the end of that node then we call
+ * walk_up_proc() on our current node which will do one of a few things based on
+ * our stage.
+ *
+ * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level
+ * then we need to walk back up the tree, and then going back down into the
+ * other slots via walk_down_tree to update any other children from our original
+ * wc->shared_level. Once we're at or above our wc->shared_level we can switch
+ * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on.
+ *
+ * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
+ * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents
+ * in our current leaf. After that we call btrfs_free_tree_block() on the
+ * current node and walk up to the next node to walk down the next slot.
+ */
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -5668,8 +6042,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
- const bool is_reloc_root = (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID);
+ const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5677,24 +6050,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
struct btrfs_root_item *root_item = &root->root_item;
struct walk_control *wc;
struct btrfs_key key;
- int err = 0;
- int ret;
+ const u64 rootid = btrfs_root_id(root);
+ int ret = 0;
int level;
bool root_dropped = false;
bool unfinished_drop = false;
- btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
+ btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
btrfs_free_path(path);
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -5707,12 +6080,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_free;
}
- err = btrfs_run_delayed_items(trans);
- if (err)
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
goto out_end_trans;
/*
@@ -5743,11 +6116,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->lowest_level = 0;
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end_trans;
- }
+
WARN_ON(ret > 0);
+ ret = 0;
/*
* unlock our path, this is safe because only this
@@ -5760,14 +6133,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_tree_lock(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK;
+ /*
+ * btrfs_lookup_extent_info() returns 0 for success,
+ * or < 0 for error.
+ */
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
- &wc->flags[level]);
- if (ret < 0) {
- err = ret;
+ &wc->flags[level], NULL);
+ if (ret < 0)
goto out_end_trans;
- }
+
BUG_ON(wc->refs[level] == 0);
if (level == btrfs_root_drop_level(root_item))
@@ -5793,19 +6169,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
break;
}
if (ret > 0) {
BUG_ON(wc->stage != DROP_REFERENCE);
+ ret = 0;
break;
}
@@ -5827,7 +6202,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
root_item);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
@@ -5838,7 +6212,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
"drop snapshot early exit");
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out_free;
}
@@ -5852,19 +6226,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_free;
}
}
}
btrfs_release_path(path);
- if (err)
+ if (ret)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
@@ -5873,16 +6246,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
NULL, NULL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
} else if (ret > 0) {
- /* if we fail to delete the orphan item this time
+ ret = 0;
+ /*
+ * If we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
* The most common failure here is just -ENOENT.
*/
- btrfs_del_orphan_item(trans, tree_root,
- root->root_key.objectid);
+ btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
}
}
@@ -5908,11 +6281,19 @@ out_free:
kfree(wc);
btrfs_free_path(path);
out:
+ if (!ret && root_dropped) {
+ ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
+ if (ret < 0)
+ btrfs_warn_rl(fs_info,
+ "failed to cleanup qgroup 0/%llu: %d",
+ rootid, ret);
+ ret = 0;
+ }
/*
* We were an unfinished drop root, check to see if there are any
* pending, and if not clear and wake up any waiters.
*/
- if (!err && unfinished_drop)
+ if (!ret && unfinished_drop)
btrfs_maybe_wake_unfinished_drop(fs_info);
/*
@@ -5924,7 +6305,7 @@ out:
*/
if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
- return err;
+ return ret;
}
/*
@@ -5944,9 +6325,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
int level;
int parent_level;
int ret = 0;
- int wret;
- BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
path = btrfs_alloc_path();
if (!path)
@@ -5980,17 +6360,16 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
while (1) {
- wret = walk_down_tree(trans, root, path, wc);
- if (wret < 0) {
- ret = wret;
+ ret = walk_down_tree(trans, root, path, wc);
+ if (ret < 0)
break;
- }
- wret = walk_up_tree(trans, root, path, wc, parent_level);
- if (wret < 0)
- ret = wret;
- if (wret != 0)
+ ret = walk_up_tree(trans, root, path, wc, parent_level);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
break;
+ }
}
kfree(wc);
@@ -5998,10 +6377,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
- u64 start, u64 end)
+/*
+ * Unpin the extent range in an error context and don't add the space back.
+ * Errors are not propagated further.
+ */
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
- return unpin_extent_range(fs_info, start, end, false);
+ unpin_extent_range(fs_info, start, end, false);
}
/*
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index ef1c1c99294e..2ad51130c037 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -3,10 +3,21 @@
#ifndef BTRFS_EXTENT_TREE_H
#define BTRFS_EXTENT_TREE_H
+#include <linux/types.h>
#include "misc.h"
#include "block-group.h"
+#include "locking.h"
+struct extent_buffer;
struct btrfs_free_cluster;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_ref;
+struct btrfs_disk_key;
+struct btrfs_delayed_ref_head;
+struct btrfs_delayed_ref_root;
+struct btrfs_extent_inline_ref;
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
@@ -91,18 +102,19 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes);
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags);
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owner_root);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes);
+ const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict,
@@ -113,6 +125,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
+ u64 reloc_src_root,
enum btrfs_lock_nesting nest);
int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 root_id,
@@ -136,12 +149,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+ const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b2ae50dcca0f..d322cf82783f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,16 +14,13 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
-#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "bio.h"
-#include "check-integrity.h"
#include "locking.h"
-#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
@@ -79,10 +76,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
pr_err(
- "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
+ WARN_ON_ONCE(1);
kmem_cache_free(extent_buffer_cache, eb);
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
@@ -103,6 +101,13 @@ struct btrfs_bio_ctrl {
blk_opf_t opf;
btrfs_bio_end_io_t end_io_func;
struct writeback_control *wbc;
+
+ /*
+ * The sectors of the page which are going to be submitted by
+ * extent_writepage_io().
+ * This is to avoid touching ranges covered by compression/inline.
+ */
+ unsigned long submit_bitmap;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
@@ -119,7 +124,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
btrfs_submit_compressed_read(bbio);
else
- btrfs_submit_bio(bbio, 0);
+ btrfs_submit_bbio(bbio, 0);
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
@@ -148,8 +153,8 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
int __init extent_buffer_init_cachep(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
- sizeof(struct extent_buffer), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_buffer), 0, 0,
+ NULL);
if (!extent_buffer_cache)
return -ENOMEM;
@@ -166,24 +171,9 @@ void __cold extent_buffer_free_cachep(void)
kmem_cache_destroy(extent_buffer_cache);
}
-void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
-{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
-
- while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- clear_page_dirty_for_io(page);
- put_page(page);
- index++;
- }
-}
-
-static void process_one_page(struct btrfs_fs_info *fs_info,
- struct page *page, struct page *locked_page,
- unsigned long page_ops, u64 start, u64 end)
+static void process_one_folio(struct btrfs_fs_info *fs_info,
+ struct folio *folio, const struct folio *locked_folio,
+ unsigned long page_ops, u64 start, u64 end)
{
u32 len;
@@ -191,23 +181,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
len = end + 1 - start;
if (page_ops & PAGE_SET_ORDERED)
- btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
if (page_ops & PAGE_START_WRITEBACK) {
- btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
- btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
}
if (page_ops & PAGE_END_WRITEBACK)
- btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
- if (page != locked_page && (page_ops & PAGE_UNLOCK))
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ if (folio != locked_folio && (page_ops & PAGE_UNLOCK))
+ btrfs_folio_end_lock(fs_info, folio, start, len);
}
-static void __process_pages_contig(struct address_space *mapping,
- struct page *locked_page, u64 start, u64 end,
- unsigned long page_ops)
+static void __process_folios_contig(struct address_space *mapping,
+ const struct folio *locked_folio, u64 start,
+ u64 end, unsigned long page_ops)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
pgoff_t index = start_index;
@@ -223,35 +213,34 @@ static void __process_pages_contig(struct address_space *mapping,
for (i = 0; i < found_folios; i++) {
struct folio *folio = fbatch.folios[i];
- process_one_page(fs_info, &folio->page, locked_page,
- page_ops, start, end);
+ process_one_folio(fs_info, folio, locked_folio,
+ page_ops, start, end);
}
folio_batch_release(&fbatch);
cond_resched();
}
}
-static noinline void __unlock_for_delalloc(struct inode *inode,
- struct page *locked_page,
+static noinline void __unlock_for_delalloc(const struct inode *inode,
+ const struct folio *locked_folio,
u64 start, u64 end)
{
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- ASSERT(locked_page);
- if (index == locked_page->index && end_index == index)
+ ASSERT(locked_folio);
+ if (index == locked_folio->index && end_index == index)
return;
- __process_pages_contig(inode->i_mapping, locked_page, start, end,
- PAGE_UNLOCK);
+ __process_folios_contig(inode->i_mapping, locked_folio, start, end,
+ PAGE_UNLOCK);
}
-static noinline int lock_delalloc_pages(struct inode *inode,
- struct page *locked_page,
- u64 start,
- u64 end)
+static noinline int lock_delalloc_folios(struct inode *inode,
+ const struct folio *locked_folio,
+ u64 start, u64 end)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct address_space *mapping = inode->i_mapping;
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
@@ -259,7 +248,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 processed_end = start;
struct folio_batch fbatch;
- if (index == locked_page->index && index == end_index)
+ if (index == locked_folio->index && index == end_index)
return 0;
folio_batch_init(&fbatch);
@@ -272,23 +261,24 @@ static noinline int lock_delalloc_pages(struct inode *inode,
goto out;
for (i = 0; i < found_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
- u32 len = end + 1 - start;
+ struct folio *folio = fbatch.folios[i];
+ u64 range_start;
+ u32 range_len;
- if (page == locked_page)
+ if (folio == locked_folio)
continue;
- if (btrfs_page_start_writer_lock(fs_info, page, start,
- len))
- goto out;
-
- if (!PageDirty(page) || page->mapping != mapping) {
- btrfs_page_end_writer_lock(fs_info, page, start,
- len);
+ folio_lock(folio);
+ if (!folio_test_dirty(folio) || folio->mapping != mapping) {
+ folio_unlock(folio);
goto out;
}
+ range_start = max_t(u64, folio_pos(folio), start);
+ range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ end + 1) - range_start;
+ btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
- processed_end = page_offset(page) + PAGE_SIZE - 1;
+ processed_end = range_start + range_len - 1;
}
folio_batch_release(&fbatch);
cond_resched();
@@ -298,7 +288,8 @@ static noinline int lock_delalloc_pages(struct inode *inode,
out:
folio_batch_release(&fbatch);
if (processed_end > start)
- __unlock_for_delalloc(inode, locked_page, start, processed_end);
+ __unlock_for_delalloc(inode, locked_folio, start,
+ processed_end);
return -EAGAIN;
}
@@ -319,10 +310,10 @@ out:
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
- struct page *locked_page, u64 *start,
- u64 *end)
+ struct folio *locked_folio,
+ u64 *start, u64 *end)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
const u64 orig_start = *start;
const u64 orig_end = *end;
@@ -338,9 +329,9 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
/* Caller should pass a valid @end to indicate the search range end */
ASSERT(orig_end > orig_start);
- /* The range should at least cover part of the page */
- ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
- orig_end <= page_offset(locked_page)));
+ /* The range should at least cover part of the folio */
+ ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) ||
+ orig_end <= folio_pos(locked_folio)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
@@ -357,25 +348,25 @@ again:
}
/*
- * start comes from the offset of locked_page. We have to lock
- * pages in order, so we can't process delalloc bytes before
- * locked_page
+ * start comes from the offset of locked_folio. We have to lock
+ * folios in order, so we can't process delalloc bytes before
+ * locked_folio
*/
if (delalloc_start < *start)
delalloc_start = *start;
/*
- * make sure to limit the number of pages we try to lock down
+ * make sure to limit the number of folios we try to lock down
*/
if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
- /* step two, lock all the pages after the page that has start */
- ret = lock_delalloc_pages(inode, locked_page,
- delalloc_start, delalloc_end);
+ /* step two, lock all the folioss after the folios that has start */
+ ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
+ delalloc_end);
ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
- /* some of the pages are gone, lets avoid looping by
+ /* some of the folios are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
@@ -395,16 +386,15 @@ again:
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, 1, cached_state);
+ EXTENT_DELALLOC, cached_state);
+
+ unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- unlock_extent(tree, delalloc_start, delalloc_end,
- &cached_state);
- __unlock_for_delalloc(inode, locked_page,
- delalloc_start, delalloc_end);
+ __unlock_for_delalloc(inode, locked_folio, delalloc_start,
+ delalloc_end);
cond_resched();
goto again;
}
- free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
@@ -412,280 +402,236 @@ out_failed:
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- struct page *locked_page,
+ const struct folio *locked_folio,
+ struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
- __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
- start, end, page_ops);
+ __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
+ end, page_ops);
}
-static bool btrfs_verify_page(struct page *page, u64 start)
+static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len)
{
- if (!fsverity_active(page->mapping->host) ||
- PageUptodate(page) ||
- start >= i_size_read(page->mapping->host))
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+
+ if (!fsverity_active(folio->mapping->host) ||
+ btrfs_folio_test_uptodate(fs_info, folio, start, len) ||
+ start >= i_size_read(folio->mapping->host))
return true;
- return fsverity_verify_page(page);
+ return fsverity_verify_folio(folio);
}
-static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
+static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_pos(folio) + PAGE_SIZE);
- if (uptodate && btrfs_verify_page(page, start))
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ if (uptodate && btrfs_verify_folio(folio, start, len))
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, page))
- unlock_page(page);
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
+ folio_unlock(folio);
else
- btrfs_subpage_end_reader(fs_info, page, start, len);
+ btrfs_folio_end_lock(fs_info, folio, start, len);
}
/*
- * after a writepage IO is done, we need to:
- * clear the uptodate bits on error
- * clear the writeback bits in the extent tree for this IO
- * end_page_writeback if the page has no more pending IO
+ * After a write IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - clear the writeback bits in the extent tree for the range
+ * - filio_end_writeback() if there is no more pending io for the folio
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct btrfs_bio *bbio)
+static void end_bbio_data_write(struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
+ const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const u32 sectorsize = fs_info->sectorsize;
- u64 start = page_offset(page) + bvec->bv_offset;
- u32 len = bvec->bv_len;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ u64 start = folio_pos(folio) + fi.offset;
+ u32 len = fi.length;
+
+ /* Only order 0 (single page) folios are allowed for data. */
+ ASSERT(folio_order(folio) == 0);
/* Our read/write should always be sector aligned. */
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ "partial page write in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page write with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page write with offset %zu and length %zu",
+ fi.offset, fi.length);
- btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
+ btrfs_finish_ordered_extent(bbio->ordered, folio, start, len,
+ !error);
if (error)
- mapping_set_error(page->mapping, error);
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ mapping_set_error(folio->mapping, error);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
bio_put(bio);
}
-/*
- * Record previously processed extent range
- *
- * For endio_readpage_release_extent() to handle a full extent range, reducing
- * the extent io operations.
- */
-struct processed_extent {
- struct btrfs_inode *inode;
- /* Start of the range in @inode */
- u64 start;
- /* End of the range in @inode */
- u64 end;
- bool uptodate;
-};
-
-/*
- * Try to release processed extent range
- *
- * May not release the extent range right now if the current range is
- * contiguous to processed extent.
- *
- * Will release processed extent when any of @inode, @uptodate, the range is
- * no longer contiguous to the processed range.
- *
- * Passing @inode == NULL will force processed extent to be released.
- */
-static void endio_readpage_release_extent(struct processed_extent *processed,
- struct btrfs_inode *inode, u64 start, u64 end,
- bool uptodate)
-{
- struct extent_state *cached = NULL;
- struct extent_io_tree *tree;
-
- /* The first extent, initialize @processed */
- if (!processed->inode)
- goto update;
-
- /*
- * Contiguous to processed extent, just uptodate the end.
- *
- * Several things to notice:
- *
- * - bio can be merged as long as on-disk bytenr is contiguous
- * This means we can have page belonging to other inodes, thus need to
- * check if the inode still matches.
- * - bvec can contain range beyond current page for multi-page bvec
- * Thus we need to do processed->end + 1 >= start check
- */
- if (processed->inode == inode && processed->uptodate == uptodate &&
- processed->end + 1 >= start && end >= processed->end) {
- processed->end = end;
- return;
- }
-
- tree = &processed->inode->io_tree;
- /*
- * Now we don't have range contiguous to the processed range, release
- * the processed range now.
- */
- unlock_extent(tree, processed->start, processed->end, &cached);
-
-update:
- /* Update processed to current range */
- processed->inode = inode;
- processed->start = start;
- processed->end = end;
- processed->uptodate = uptodate;
-}
-
-static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
+static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
{
- ASSERT(PageLocked(page));
- if (!btrfs_is_subpage(fs_info, page))
+ ASSERT(folio_test_locked(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page));
- btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+ ASSERT(folio_test_private(folio));
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
}
/*
- * after a readpage IO is done, we need to:
- * clear the uptodate bits on error
- * set the uptodate bits if things worked
- * set the page up to date if all extents in the tree are uptodate
- * clear the lock bit in the extent tree
- * unlock the page if there are no other extents locked for it
+ * After a data read IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - set the uptodate bits if things worked
+ * - set the folio up to date if all extents in the tree are uptodate
+ * - clear the lock bit in the extent tree
+ * - unlock the folio if there are no other extents locked for it
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct btrfs_bio *bbio)
+static void end_bbio_data_read(struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct bio *bio = &bbio->bio;
- struct bio_vec *bvec;
- struct processed_extent processed = { 0 };
- /*
- * The offset to the beginning of a bio, since one bio can never be
- * larger than UINT_MAX, u32 here is enough.
- */
- u32 bio_offset = 0;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
+ const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const u32 sectorsize = fs_info->sectorsize;
+ struct folio *folio = fi.folio;
+ struct inode *inode = folio->mapping->host;
u64 start;
u64 end;
u32 len;
btrfs_debug(fs_info,
- "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- bio->bi_iter.bi_sector, bio->bi_status,
+ "%s: bi_sector=%llu, err=%d, mirror=%u",
+ __func__, bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
/*
* We always issue full-sector reads, but if some block in a
- * page fails to read, blk_update_request() will advance
+ * folio fails to read, blk_update_request() will advance
* bv_offset and adjust bv_len to compensate. Print a warning
* for unaligned offsets, and an error if they don't add up to
* a full sector.
*/
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
- sectorsize))
+ "partial page read in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page read with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page read with offset %zu and length %zu",
+ fi.offset, fi.length);
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
- len = bvec->bv_len;
+ start = folio_pos(folio) + fi.offset;
+ end = start + fi.length - 1;
+ len = fi.length;
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
/*
* Zero out the remaining part if this range straddles
* i_size.
*
- * Here we should only zero the range inside the bvec,
+ * Here we should only zero the range inside the folio,
* not touch anything else.
*
- * NOTE: i_size is exclusive while end is inclusive.
+ * NOTE: i_size is exclusive while end is inclusive and
+ * folio_contains() takes PAGE_SIZE units.
*/
- if (page->index == end_index && i_size <= end) {
- u32 zero_start = max(offset_in_page(i_size),
- offset_in_page(start));
-
- zero_user_segment(page, zero_start,
- offset_in_page(end) + 1);
+ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
+ i_size <= end) {
+ u32 zero_start = max(offset_in_folio(folio, i_size),
+ offset_in_folio(folio, start));
+ u32 zero_len = offset_in_folio(folio, end) + 1 -
+ zero_start;
+
+ folio_zero_range(folio, zero_start, zero_len);
}
}
/* Update page status and unlock. */
- end_page_read(page, uptodate, start, len);
- endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, uptodate);
-
- ASSERT(bio_offset + len > bio_offset);
- bio_offset += len;
-
+ end_folio_read(folio, uptodate, start, len);
}
- /* Release the last extent */
- endio_readpage_release_extent(&processed, NULL, 0, 0, false);
bio_put(bio);
}
/*
- * Populate every free slot in a provided array with pages.
+ * Populate every free slot in a provided array with folios using GFP_NOFS.
+ *
+ * @nr_folios: number of folios to allocate
+ * @folio_array: the array to fill with folios; any existing non-NULL entries in
+ * the array will be skipped
+ *
+ * Return: 0 if all folios were able to be allocated;
+ * -ENOMEM otherwise, the partially allocated folios would be freed and
+ * the array slots zeroed
+ */
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
+{
+ for (int i = 0; i < nr_folios; i++) {
+ if (folio_array[i])
+ continue;
+ folio_array[i] = folio_alloc(GFP_NOFS, 0);
+ if (!folio_array[i])
+ goto error;
+ }
+ return 0;
+error:
+ for (int i = 0; i < nr_folios; i++) {
+ if (folio_array[i])
+ folio_put(folio_array[i]);
+ }
+ return -ENOMEM;
+}
+
+/*
+ * Populate every free slot in a provided array with pages, using GFP_NOFS.
*
* @nr_pages: number of pages to allocate
* @page_array: the array to fill with pages; any existing non-null entries in
- * the array will be skipped
+ * the array will be skipped
+ * @nofail: whether using __GFP_NOFAIL flag
*
* Return: 0 if all pages were able to be allocated;
* -ENOMEM otherwise, the partially allocated pages would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ bool nofail)
{
+ const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
unsigned int allocated;
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+ allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array);
if (unlikely(allocated == last)) {
/* No progress, fail and do cleanup. */
for (int i = 0; i < allocated; i++) {
@@ -698,13 +644,36 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
return 0;
}
+/*
+ * Populate needed folios for the extent buffer.
+ *
+ * For now, the folios populated are always in order 0 (aka, single page).
+ */
+static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
+{
+ struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
+ int num_pages = num_extent_pages(eb);
+ int ret;
+
+ ret = btrfs_alloc_page_array(num_pages, page_array, nofail);
+ if (ret < 0)
+ return ret;
+
+ for (int i = 0; i < num_pages; i++)
+ eb->folios[i] = page_folio(page_array[i]);
+ eb->folio_size = PAGE_SIZE;
+ eb->folio_shift = PAGE_SHIFT;
+ return 0;
+}
+
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page, u64 disk_bytenr,
+ struct folio *folio, u64 disk_bytenr,
unsigned int pg_offset)
{
struct bio *bio = &bio_ctrl->bbio->bio;
struct bio_vec *bvec = bio_last_bvec_all(bio);
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
+ struct folio *bv_folio = page_folio(bvec->bv_page);
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
/*
@@ -717,7 +686,7 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
/*
* The contig check requires the following conditions to be met:
*
- * 1) The pages are belonging to the same inode
+ * 1) The folios are belonging to the same inode
* This is implied by the call chain.
*
* 2) The range has adjacent logical bytenr
@@ -726,8 +695,8 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
* This is required for the usage of btrfs_bio->file_offset.
*/
return bio_end_sector(bio) == sector &&
- page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len ==
- page_offset(page) + pg_offset;
+ folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len ==
+ folio_pos(folio) + pg_offset;
}
static void alloc_new_bio(struct btrfs_inode *inode,
@@ -780,17 +749,17 @@ static void alloc_new_bio(struct btrfs_inode *inode,
* The mirror number for this IO should already be initizlied in
* @bio_ctrl->mirror_num.
*/
-static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
- u64 disk_bytenr, struct page *page,
+static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, struct folio *folio,
size_t size, unsigned long pg_offset)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(folio);
ASSERT(pg_offset + size <= PAGE_SIZE);
ASSERT(bio_ctrl->end_io_func);
if (bio_ctrl->bbio &&
- !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset))
+ !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset))
submit_one_bio(bio_ctrl);
do {
@@ -799,31 +768,32 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
/* Allocate new bio if needed */
if (!bio_ctrl->bbio) {
alloc_new_bio(inode, bio_ctrl, disk_bytenr,
- page_offset(page) + pg_offset);
+ folio_pos(folio) + pg_offset);
}
/* Cap to the current ordered extent boundary if there is one. */
if (len > bio_ctrl->len_to_oe_boundary) {
ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
- ASSERT(is_data_inode(&inode->vfs_inode));
+ ASSERT(is_data_inode(inode));
len = bio_ctrl->len_to_oe_boundary;
}
- if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) {
+ if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) {
/* bio full: move on to a new one */
submit_one_bio(bio_ctrl);
continue;
}
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, page, len);
+ wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
+ len);
size -= len;
pg_offset += len;
disk_bytenr += len;
/*
- * len_to_oe_boundary defaults to U32_MAX, which isn't page or
+ * len_to_oe_boundary defaults to U32_MAX, which isn't folio or
* sector aligned. alloc_new_bio() then sets it to the end of
* our ordered extent for writes into zoned devices.
*
@@ -833,15 +803,15 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
* boundary is correct.
*
* When len_to_oe_boundary is U32_MAX, the cap above would
- * result in a 4095 byte IO for the last page right before
- * we hit the bio limit of UINT_MAX. bio_add_page() has all
+ * result in a 4095 byte IO for the last folio right before
+ * we hit the bio limit of UINT_MAX. bio_add_folio() has all
* the checks required to make sure we don't overflow the bio,
* and we should just ignore len_to_oe_boundary completely
* unless we're using it to track an ordered extent.
*
* It's pretty hard to make a bio sized U32_MAX, but it can
* happen when the page cache is able to feed us contiguous
- * pages for large extents.
+ * folios for large extents.
*/
if (bio_ctrl->len_to_oe_boundary != U32_MAX)
bio_ctrl->len_to_oe_boundary -= len;
@@ -852,9 +822,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
} while (size);
}
-static int attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page,
- struct btrfs_subpage *prealloc)
+static int attach_extent_buffer_folio(struct extent_buffer *eb,
+ struct folio *folio,
+ struct btrfs_subpage *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -865,74 +835,80 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
* For cloned or dummy extent buffers, their pages are not mapped and
* will not race with any other ebs.
*/
- if (page->mapping)
- lockdep_assert_held(&page->mapping->private_lock);
+ if (folio->mapping)
+ lockdep_assert_held(&folio->mapping->i_private_lock);
if (fs_info->nodesize >= PAGE_SIZE) {
- if (!PagePrivate(page))
- attach_page_private(page, eb);
+ if (!folio_test_private(folio))
+ folio_attach_private(folio, eb);
else
- WARN_ON(page->private != (unsigned long)eb);
+ WARN_ON(folio_get_private(folio) != eb);
return 0;
}
/* Already mapped, just free prealloc */
- if (PagePrivate(page)) {
+ if (folio_test_private(folio)) {
btrfs_free_subpage(prealloc);
return 0;
}
if (prealloc)
/* Has preallocated memory for subpage */
- attach_page_private(page, prealloc);
+ folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, page,
- BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
int set_page_extent_mapped(struct page *page)
{
+ return set_folio_extent_mapped(page_folio(page));
+}
+
+int set_folio_extent_mapped(struct folio *folio)
+{
struct btrfs_fs_info *fs_info;
- ASSERT(page->mapping);
+ ASSERT(folio->mapping);
- if (PagePrivate(page))
+ if (folio_test_private(folio))
return 0;
- fs_info = btrfs_sb(page->mapping->host->i_sb);
+ fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+ if (btrfs_is_subpage(fs_info, folio->mapping))
+ return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
- attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
}
-void clear_page_extent_mapped(struct page *page)
+void clear_folio_extent_mapped(struct folio *folio)
{
struct btrfs_fs_info *fs_info;
- ASSERT(page->mapping);
+ ASSERT(folio->mapping);
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return;
- fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_detach_subpage(fs_info, page);
+ fs_info = folio_to_fs_info(folio);
+ if (btrfs_is_subpage(fs_info, folio->mapping))
+ return btrfs_detach_subpage(fs_info, folio);
- detach_page_private(page);
+ folio_detach_private(folio);
}
-static struct extent_map *
-__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, struct extent_map **em_cached)
+static struct extent_map *get_extent_map(struct btrfs_inode *inode,
+ struct folio *folio, u64 start,
+ u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
- if (em_cached && *em_cached) {
+ ASSERT(em_cached);
+
+ if (*em_cached) {
em = *em_cached;
if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
@@ -944,12 +920,13 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
- if (em_cached && !IS_ERR(em)) {
+ em = btrfs_get_extent(inode, folio, start, len);
+ if (!IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
}
+
return em;
}
/*
@@ -959,12 +936,12 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 start = page_offset(page);
+ struct inode *inode = folio->mapping->host;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ u64 start = folio_pos(folio);
const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
@@ -975,25 +952,23 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = fs_info->sectorsize;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_extent(tree, start, end, NULL);
- unlock_page(page);
+ folio_unlock(folio);
return ret;
}
- if (page->index == last_byte >> PAGE_SHIFT) {
- size_t zero_offset = offset_in_page(last_byte);
+ if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
+ size_t zero_offset = offset_in_folio(folio, last_byte);
if (zero_offset) {
- iosize = PAGE_SIZE - zero_offset;
- memzero_page(page, zero_offset, iosize);
+ iosize = folio_size(folio) - zero_offset;
+ folio_zero_range(folio, zero_offset, iosize);
}
}
- bio_ctrl->end_io_func = end_bio_extent_readpage;
- begin_page_read(fs_info, page);
+ bio_ctrl->end_io_func = end_bbio_data_read;
+ begin_folio_read(fs_info, folio);
while (cur <= end) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
bool force_bio_submit = false;
@@ -1001,34 +976,30 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- iosize = PAGE_SIZE - pg_offset;
- memzero_page(page, pg_offset, iosize);
- unlock_extent(tree, cur, cur + iosize - 1, NULL);
- end_page_read(page, true, cur, iosize);
+ iosize = folio_size(folio) - pg_offset;
+ folio_zero_range(folio, pg_offset, iosize);
+ end_folio_read(folio, true, cur, iosize);
break;
}
- em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, em_cached);
+ em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached);
if (IS_ERR(em)) {
- unlock_extent(tree, cur, end, NULL);
- end_page_read(page, false, cur, end + 1 - cur);
+ end_folio_read(folio, false, cur, end + 1 - cur);
return PTR_ERR(em);
}
extent_offset = cur - em->start;
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- compress_type = em->compress_type;
+ compress_type = extent_map_compression(em);
iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
if (compress_type != BTRFS_COMPRESS_NONE)
- disk_bytenr = em->block_start;
+ disk_bytenr = em->disk_bytenr;
else
- disk_bytenr = em->block_start + extent_offset;
- block_start = em->block_start;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ disk_bytenr = extent_map_block_start(em) + extent_offset;
+ block_start = extent_map_block_start(em);
+ if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
/*
@@ -1037,8 +1008,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
- * single bio to populate the pages for the 2 ranges because
- * this makes the compressed extent read zero out the pages
+ * single bio to populate the folios for the 2 ranges because
+ * this makes the compressed extent read zero out the folios
* belonging to the 2nd range. Imagine the following scenario:
*
* File layout
@@ -1051,13 +1022,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* [extent X, compressed length = 4K uncompressed length = 16K]
*
* If the bio to read the compressed extent covers both ranges,
- * it will decompress extent X into the pages belonging to the
+ * it will decompress extent X into the folios belonging to the
* first range and then it will stop, zeroing out the remaining
- * pages that belong to the other range that points to extent X.
+ * folios that belong to the other range that points to extent X.
* So here we make sure we submit 2 bios, one for the first
* range and another one for the third range. Both will target
* the same physical extent from disk, but we can't currently
- * make the compressed bio endio callback populate the pages
+ * make the compressed bio endio callback populate the folios
* for both ranges because each compressed bio is tightly
* coupled with a single extent map, and each range can have
* an extent map with a different offset value relative to the
@@ -1065,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* is a corner case so we prioritize correctness over
* non-optimal behavior (submitting 2 bios for the same extent).
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ if (compress_type != BTRFS_COMPRESS_NONE &&
prev_em_start && *prev_em_start != (u64)-1 &&
*prev_em_start != em->start)
force_bio_submit = true;
@@ -1078,18 +1049,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- memzero_page(page, pg_offset, iosize);
+ folio_zero_range(folio, pg_offset, iosize);
- unlock_extent(tree, cur, cur + iosize - 1, NULL);
- end_page_read(page, true, cur, iosize);
+ end_folio_read(folio, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
- /* the get_extent function already copied into the page */
+ /* the get_extent function already copied into the folio */
if (block_start == EXTENT_MAP_INLINE) {
- unlock_extent(tree, cur, cur + iosize - 1, NULL);
- end_page_read(page, true, cur, iosize);
+ end_folio_read(folio, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -1102,8 +1071,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
- submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
- pg_offset);
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize,
+ pg_offset);
cur = cur + iosize;
pg_offset += iosize;
}
@@ -1113,16 +1082,20 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
int btrfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
+ struct btrfs_inode *inode = folio_to_inode(folio);
+ const u64 start = folio_pos(folio);
+ const u64 end = start + folio_size(folio) - 1;
+ struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct extent_map *em_cached = NULL;
int ret;
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
+
+ free_extent_map(em_cached);
- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL);
/*
* If btrfs_do_readpage() failed we will want to submit the assembled
* bio to do the cleanup.
@@ -1131,60 +1104,222 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
return ret;
}
-static inline void contiguous_readpages(struct page *pages[], int nr_pages,
- u64 start, u64 end,
- struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl,
- u64 *prev_em_start)
+static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap,
+ u64 start, u32 len)
{
- struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
- int index;
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ const u64 folio_start = folio_pos(folio);
+ unsigned int start_bit;
+ unsigned int nbits;
+
+ ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
+ nbits = len >> fs_info->sectorsize_bits;
+ ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
+ bitmap_set(delalloc_bitmap, start_bit, nbits);
+}
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+static bool find_next_delalloc_bitmap(struct folio *folio,
+ unsigned long *delalloc_bitmap, u64 start,
+ u64 *found_start, u32 *found_len)
+{
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ const u64 folio_start = folio_pos(folio);
+ const unsigned int bitmap_size = fs_info->sectors_per_page;
+ unsigned int start_bit;
+ unsigned int first_zero;
+ unsigned int first_set;
+
+ ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+
+ start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
+ first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
+ if (first_set >= bitmap_size)
+ return false;
- for (index = 0; index < nr_pages; index++) {
- btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
- prev_em_start);
- put_page(pages[index]);
- }
+ *found_start = folio_start + (first_set << fs_info->sectorsize_bits);
+ first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set);
+ *found_len = (first_zero - first_set) << fs_info->sectorsize_bits;
+ return true;
}
/*
- * helper for __extent_writepage, doing all of the delayed allocation setup.
+ * Do all of the delayed allocation setup.
*
- * This returns 1 if btrfs_run_delalloc_range function did all the work required
- * to write the page (copy into inline extent). In this case the IO has
- * been started and the page is already unlocked.
+ * Return >0 if all the dirty blocks are submitted async (compression) or inlined.
+ * The @folio should no longer be touched (treat it as already unlocked).
*
- * This returns 0 if all went well (page still locked)
- * This returns < 0 if there were errors (page still locked)
+ * Return 0 if there is still dirty block that needs to be submitted through
+ * extent_writepage_io().
+ * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be
+ * submitted, and @folio is still kept locked.
+ *
+ * Return <0 if there is any error hit.
+ * Any allocated ordered extent range covering this folio will be marked
+ * finished (IOERR), and @folio is still kept locked.
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
- struct page *page, struct writeback_control *wbc)
+ struct folio *folio,
+ struct btrfs_bio_ctrl *bio_ctrl)
{
- const u64 page_start = page_offset(page);
- const u64 page_end = page_start + PAGE_SIZE - 1;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
+ struct writeback_control *wbc = bio_ctrl->wbc;
+ const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
+ const u64 page_start = folio_pos(folio);
+ const u64 page_end = page_start + folio_size(folio) - 1;
+ unsigned long delalloc_bitmap = 0;
+ /*
+ * Save the last found delalloc end. As the delalloc end can go beyond
+ * page boundary, thus we cannot rely on subpage bitmap to locate the
+ * last delalloc end.
+ */
+ u64 last_delalloc_end = 0;
+ /*
+ * The range end (exclusive) of the last successfully finished delalloc
+ * range.
+ * Any range covered by ordered extent must either be manually marked
+ * finished (error handling), or has IO submitted (and finish the
+ * ordered extent normally).
+ *
+ * This records the end of ordered extent cleanup if we hit an error.
+ */
+ u64 last_finished_delalloc_end = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
int ret = 0;
+ int bit;
+ /* Save the dirty bitmap as our submission bitmap will be a subset of it. */
+ if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
+ ASSERT(fs_info->sectors_per_page > 1);
+ btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
+ } else {
+ bio_ctrl->submit_bitmap = 1;
+ }
+
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ u64 start = page_start + (bit << fs_info->sectorsize_bits);
+
+ btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
+ }
+
+ /* Lock all (subpage) delalloc ranges inside the folio first. */
while (delalloc_start < page_end) {
delalloc_end = page_end;
- if (!find_lock_delalloc_range(&inode->vfs_inode, page,
+ if (!find_lock_delalloc_range(&inode->vfs_inode, folio,
&delalloc_start, &delalloc_end)) {
delalloc_start = delalloc_end + 1;
continue;
}
-
- ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
- delalloc_end, wbc);
- if (ret < 0)
- return ret;
-
+ set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
+ min(delalloc_end, page_end) + 1 - delalloc_start);
+ last_delalloc_end = delalloc_end;
delalloc_start = delalloc_end + 1;
}
+ delalloc_start = page_start;
+ if (!last_delalloc_end)
+ goto out;
+
+ /* Run the delalloc ranges for the above locked ranges. */
+ while (delalloc_start < page_end) {
+ u64 found_start;
+ u32 found_len;
+ bool found;
+
+ if (!is_subpage) {
+ /*
+ * For non-subpage case, the found delalloc range must
+ * cover this folio and there must be only one locked
+ * delalloc range.
+ */
+ found_start = page_start;
+ found_len = last_delalloc_end + 1 - found_start;
+ found = true;
+ } else {
+ found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
+ delalloc_start, &found_start, &found_len);
+ }
+ if (!found)
+ break;
+ /*
+ * The subpage range covers the last sector, the delalloc range may
+ * end beyond the folio boundary, use the saved delalloc_end
+ * instead.
+ */
+ if (found_start + found_len >= page_end)
+ found_len = last_delalloc_end + 1 - found_start;
+
+ if (ret >= 0) {
+ /*
+ * Some delalloc range may be created by previous folios.
+ * Thus we still need to clean up this range during error
+ * handling.
+ */
+ last_finished_delalloc_end = found_start;
+ /* No errors hit so far, run the current delalloc range. */
+ ret = btrfs_run_delalloc_range(inode, folio,
+ found_start,
+ found_start + found_len - 1,
+ wbc);
+ if (ret >= 0)
+ last_finished_delalloc_end = found_start + found_len;
+ } else {
+ /*
+ * We've hit an error during previous delalloc range,
+ * have to cleanup the remaining locked ranges.
+ */
+ unlock_extent(&inode->io_tree, found_start,
+ found_start + found_len - 1, NULL);
+ __unlock_for_delalloc(&inode->vfs_inode, folio,
+ found_start,
+ found_start + found_len - 1);
+ }
+
+ /*
+ * We have some ranges that's going to be submitted asynchronously
+ * (compression or inline). These range have their own control
+ * on when to unlock the pages. We should not touch them
+ * anymore, so clear the range from the submission bitmap.
+ */
+ if (ret > 0) {
+ unsigned int start_bit = (found_start - page_start) >>
+ fs_info->sectorsize_bits;
+ unsigned int end_bit = (min(page_end + 1, found_start + found_len) -
+ page_start) >> fs_info->sectorsize_bits;
+ bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
+ }
+ /*
+ * Above btrfs_run_delalloc_range() may have unlocked the folio,
+ * thus for the last range, we cannot touch the folio anymore.
+ */
+ if (found_start + found_len >= last_delalloc_end + 1)
+ break;
+
+ delalloc_start = found_start + found_len;
+ }
+ /*
+ * It's possible we had some ordered extents created before we hit
+ * an error, cleanup non-async successfully created delalloc ranges.
+ */
+ if (unlikely(ret < 0)) {
+ unsigned int bitmap_size = min(
+ (last_finished_delalloc_end - page_start) >>
+ fs_info->sectorsize_bits,
+ fs_info->sectors_per_page);
+
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
+ btrfs_mark_ordered_io_finished(inode, folio,
+ page_start + (bit << fs_info->sectorsize_bits),
+ fs_info->sectorsize, false);
+ return ret;
+ }
+out:
+ if (last_delalloc_end)
+ delalloc_end = last_delalloc_end;
+ else
+ delalloc_end = page_end;
/*
* delalloc_end is already one less than the total length, so
* we don't subtract one from PAGE_SIZE
@@ -1193,10 +1328,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
/*
- * If btrfs_run_dealloc_range() already started I/O and unlocked
- * the pages, we just need to account for them here.
+ * If all ranges are submitted asynchronously, we just need to account
+ * for them here.
*/
- if (ret == 1) {
+ if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) {
wbc->nr_to_write -= delalloc_to_write;
return 1;
}
@@ -1214,178 +1349,165 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
}
/*
- * Find the first byte we need to write.
- *
- * For subpage, one page can contain several sectors, and
- * __extent_writepage_io() will just grab all extent maps in the page
- * range and try to submit all non-inline/non-compressed extents.
+ * Return 0 if we have submitted or queued the sector for submission.
+ * Return <0 for critical errors.
*
- * This is a big problem for subpage, we shouldn't re-submit already written
- * data at all.
- * This function will lookup subpage dirty bit to find which range we really
- * need to submit.
- *
- * Return the next dirty range in [@start, @end).
- * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
+ * Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
-static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
- struct page *page, u64 *start, u64 *end)
+static int submit_one_sector(struct btrfs_inode *inode,
+ struct folio *folio,
+ u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
+ loff_t i_size)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- struct btrfs_subpage_info *spi = fs_info->subpage_info;
- u64 orig_start = *start;
- /* Declare as unsigned long so we can use bitmap ops */
- unsigned long flags;
- int range_start_bit;
- int range_end_bit;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map *em;
+ u64 block_start;
+ u64 disk_bytenr;
+ u64 extent_offset;
+ u64 em_end;
+ const u32 sectorsize = fs_info->sectorsize;
- /*
- * For regular sector size == page size case, since one page only
- * contains one sector, we return the page offset directly.
- */
- if (!btrfs_is_subpage(fs_info, page)) {
- *start = page_offset(page);
- *end = page_offset(page) + PAGE_SIZE;
- return;
- }
+ ASSERT(IS_ALIGNED(filepos, sectorsize));
+
+ /* @filepos >= i_size case should be handled by the caller. */
+ ASSERT(filepos < i_size);
- range_start_bit = spi->dirty_offset +
- (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+ em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
+ if (IS_ERR(em))
+ return PTR_ERR_OR_ZERO(em);
- /* We should have the page locked, but just in case */
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
- spi->dirty_offset + spi->bitmap_nr_bits);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ extent_offset = filepos - em->start;
+ em_end = extent_map_end(em);
+ ASSERT(filepos <= em_end);
+ ASSERT(IS_ALIGNED(em->start, sectorsize));
+ ASSERT(IS_ALIGNED(em->len, sectorsize));
- range_start_bit -= spi->dirty_offset;
- range_end_bit -= spi->dirty_offset;
+ block_start = extent_map_block_start(em);
+ disk_bytenr = extent_map_block_start(em) + extent_offset;
- *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
- *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
+ ASSERT(!extent_map_is_compressed(em));
+ ASSERT(block_start != EXTENT_MAP_HOLE);
+ ASSERT(block_start != EXTENT_MAP_INLINE);
+
+ free_extent_map(em);
+ em = NULL;
+
+ /*
+ * Although the PageDirty bit is cleared before entering this
+ * function, subpage dirty bit is not cleared.
+ * So clear subpage dirty bit here so next time we won't submit
+ * a folio for a range already written to disk.
+ */
+ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+ btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
+ /*
+ * Above call should set the whole folio with writeback flag, even
+ * just for a single subpage sector.
+ * As long as the folio is properly locked and the range is correct,
+ * we should always get the folio with writeback flag.
+ */
+ ASSERT(folio_test_writeback(folio));
+
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio,
+ sectorsize, filepos - folio_pos(folio));
+ return 0;
}
/*
- * helper for __extent_writepage. This calls the writepage start hooks,
+ * Helper for extent_writepage(). This calls the writepage start hooks,
* and does the loop to map the page into extents and bios.
*
* We return 1 if the IO is started and the page is unlocked,
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
- struct page *page,
- struct btrfs_bio_ctrl *bio_ctrl,
- loff_t i_size,
- int *nr_ret)
+static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
+ struct folio *folio,
+ u64 start, u32 len,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ loff_t i_size)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 cur = page_offset(page);
- u64 end = cur + PAGE_SIZE - 1;
- u64 extent_offset;
- u64 block_start;
- struct extent_map *em;
+ unsigned long range_bitmap = 0;
+ bool submitted_io = false;
+ bool error = false;
+ const u64 folio_start = folio_pos(folio);
+ u64 cur;
+ int bit;
int ret = 0;
- int nr = 0;
- ret = btrfs_writepage_cow_fixup(page);
+ ASSERT(start >= folio_start &&
+ start + len <= folio_start + folio_size(folio));
+
+ ret = btrfs_writepage_cow_fixup(folio);
if (ret) {
/* Fixup worker will requeue */
- redirty_page_for_writepage(bio_ctrl->wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(bio_ctrl->wbc, folio);
+ folio_unlock(folio);
return 1;
}
- bio_ctrl->end_io_func = end_bio_extent_writepage;
- while (cur <= end) {
- u32 len = end - cur + 1;
- u64 disk_bytenr;
- u64 em_end;
- u64 dirty_range_start = cur;
- u64 dirty_range_end;
- u32 iosize;
+ for (cur = start; cur < start + len; cur += fs_info->sectorsize)
+ set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
+ bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
+ fs_info->sectors_per_page);
+
+ bio_ctrl->end_io_func = end_bbio_data_write;
+
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
- btrfs_mark_ordered_io_finished(inode, page, cur, len,
- true);
+ btrfs_mark_ordered_io_finished(inode, folio, cur,
+ start + len - cur, true);
/*
* This range is beyond i_size, thus we don't need to
* bother writing back.
* But we still need to clear the dirty subpage bit, or
- * the next time the page gets dirtied, we will try to
+ * the next time the folio gets dirtied, we will try to
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, len);
+ btrfs_folio_clear_dirty(fs_info, folio, cur,
+ start + len - cur);
break;
}
-
- find_next_dirty_byte(fs_info, page, &dirty_range_start,
- &dirty_range_end);
- if (cur < dirty_range_start) {
- cur = dirty_range_start;
+ ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
+ if (unlikely(ret < 0)) {
+ /*
+ * bio_ctrl may contain a bio crossing several folios.
+ * Submit it immediately so that the bio has a chance
+ * to finish normally, other than marked as error.
+ */
+ submit_one_bio(bio_ctrl);
+ /*
+ * Failed to grab the extent map which should be very rare.
+ * Since there is no bio submitted to finish the ordered
+ * extent, we have to manually finish this sector.
+ */
+ btrfs_mark_ordered_io_finished(inode, folio, cur,
+ fs_info->sectorsize, false);
+ error = true;
continue;
}
-
- em = btrfs_get_extent(inode, NULL, 0, cur, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR_OR_ZERO(em);
- goto out_error;
- }
-
- extent_offset = cur - em->start;
- em_end = extent_map_end(em);
- ASSERT(cur <= em_end);
- ASSERT(cur < end);
- ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
- ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
-
- block_start = em->block_start;
- disk_bytenr = em->block_start + extent_offset;
-
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
- ASSERT(block_start != EXTENT_MAP_HOLE);
- ASSERT(block_start != EXTENT_MAP_INLINE);
-
- /*
- * Note that em_end from extent_map_end() and dirty_range_end from
- * find_next_dirty_byte() are all exclusive
- */
- iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
- free_extent_map(em);
- em = NULL;
-
- btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
- if (!PageWriteback(page)) {
- btrfs_err(inode->root->fs_info,
- "page %lu not writeback, cur %llu end %llu",
- page->index, cur, end);
- }
-
- /*
- * Although the PageDirty bit is cleared before entering this
- * function, subpage dirty bit is not cleared.
- * So clear subpage dirty bit here so next time we won't submit
- * page for range already written to disk.
- */
- btrfs_page_clear_dirty(fs_info, page, cur, iosize);
-
- submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
- cur - page_offset(page));
- cur += iosize;
- nr++;
+ submitted_io = true;
}
- btrfs_page_assert_not_dirty(fs_info, page);
- *nr_ret = nr;
- return 0;
-
-out_error:
/*
- * If we finish without problem, we should not only clear page dirty,
- * but also empty subpage dirty bits
+ * If we didn't submitted any sector (>= i_size), folio dirty get
+ * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
+ * by folio_start_writeback() if the folio is not dirty).
+ *
+ * Here we set writeback and clear for the range. If the full folio
+ * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
+ *
+ * If we hit any error, the corresponding sector will still be dirty
+ * thus no need to clear PAGECACHE_TAG_DIRTY.
*/
- *nr_ret = nr;
+ if (!submitted_io && !error) {
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
+ }
return ret;
}
@@ -1398,60 +1520,60 @@ out_error:
* Return 0 if everything goes well.
* Return <0 for error.
*/
-static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
+static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
- const u64 page_start = page_offset(page);
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
- int nr = 0;
size_t pg_offset;
- loff_t i_size = i_size_read(inode);
+ loff_t i_size = i_size_read(&inode->vfs_inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- trace___extent_writepage(page, inode, bio_ctrl->wbc);
+ trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
- WARN_ON(!PageLocked(page));
+ WARN_ON(!folio_test_locked(folio));
- pg_offset = offset_in_page(i_size);
- if (page->index > end_index ||
- (page->index == end_index && !pg_offset)) {
+ pg_offset = offset_in_folio(folio, i_size);
+ if (folio->index > end_index ||
+ (folio->index == end_index && !pg_offset)) {
folio_invalidate(folio, 0, folio_size(folio));
folio_unlock(folio);
return 0;
}
- if (page->index == end_index)
- memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
+ if (folio->index == end_index)
+ folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
- ret = set_page_extent_mapped(page);
+ /*
+ * Default to unlock the whole folio.
+ * The proper bitmap can only be initialized until writepage_delalloc().
+ */
+ bio_ctrl->submit_bitmap = (unsigned long)-1;
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto done;
- ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
+ ret = writepage_delalloc(inode, folio, bio_ctrl);
if (ret == 1)
return 0;
if (ret)
goto done;
- ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
+ ret = extent_writepage_io(inode, folio, folio_pos(folio),
+ PAGE_SIZE, bio_ctrl, i_size);
if (ret == 1)
return 0;
bio_ctrl->wbc->nr_to_write--;
done:
- if (nr == 0) {
- /* make sure the mapping tag for page dirty gets cleared */
- set_page_writeback(page);
- end_page_writeback(page);
- }
- if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start,
- PAGE_SIZE, !ret);
- mapping_set_error(page->mapping, ret);
- }
- unlock_page(page);
+ if (ret < 0)
+ mapping_set_error(folio->mapping, ret);
+ /*
+ * Only unlock ranges that are submitted. As there can be some async
+ * submitted ranges inside the folio.
+ */
+ btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
ASSERT(ret <= 0);
return ret;
}
@@ -1537,7 +1659,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
* can be no longer dirty nor marked anymore for writeback (if a
* subsequent modification to the extent buffer didn't happen before the
* transaction commit), which makes filemap_fdata[write|wait]_range not
- * able to find the pages tagged with SetPageError at transaction
+ * able to find the pages which contain errors at transaction
* commit time. So if this happens we must abort the transaction,
* otherwise we commit a super block with btree roots that point to
* btree nodes/leafs whose content on disk is invalid - either garbage
@@ -1583,7 +1705,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
* context.
*/
static struct extent_buffer *find_extent_buffer_nolock(
- struct btrfs_fs_info *fs_info, u64 start)
+ const struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
@@ -1598,24 +1720,23 @@ static struct extent_buffer *find_extent_buffer_nolock(
return NULL;
}
-static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
if (!uptodate)
set_btree_ioerr(eb);
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ struct folio *folio = fi.folio;
+ u32 len = fi.length;
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
bio_offset += len;
}
@@ -1664,39 +1785,46 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
- eb->fs_info, extent_buffer_write_end_io, eb);
+ eb->fs_info, end_bbio_meta_write, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
if (fs_info->nodesize < PAGE_SIZE) {
- struct page *p = eb->pages[0];
+ struct folio *folio = eb->folios[0];
+ bool ret;
- lock_page(p);
- btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start,
+ folio_lock(folio);
+ btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
+ if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
eb->len)) {
- clear_page_dirty_for_io(p);
+ folio_clear_dirty_for_io(folio);
wbc->nr_to_write--;
}
- __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p));
- wbc_account_cgroup_owner(wbc, p, eb->len);
- unlock_page(p);
+ ret = bio_add_folio(&bbio->bio, folio, eb->len,
+ eb->start - folio_pos(folio));
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio, eb->len);
+ folio_unlock(folio);
} else {
- for (int i = 0; i < num_extent_pages(eb); i++) {
- struct page *p = eb->pages[i];
-
- lock_page(p);
- clear_page_dirty_for_io(p);
- set_page_writeback(p);
- __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0);
- wbc_account_cgroup_owner(wbc, p, PAGE_SIZE);
- wbc->nr_to_write--;
- unlock_page(p);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ bool ret;
+
+ folio_lock(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_start_writeback(folio);
+ ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
+ wbc->nr_to_write -= folio_nr_pages(folio);
+ folio_unlock(folio);
}
}
- btrfs_submit_bio(bbio, 0);
+ btrfs_submit_bbio(bbio, 0);
}
/*
@@ -1713,17 +1841,17 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
* Return >=0 for the number of submitted extent buffers.
* Return <0 for fatal error.
*/
-static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
+static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
int submitted = 0;
- u64 page_start = page_offset(page);
+ u64 folio_start = folio_pos(folio);
int bit_start = 0;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ while (bit_start < fs_info->sectors_per_page) {
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
u64 start;
@@ -1732,21 +1860,21 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* Take private lock to ensure the subpage won't be detached
* in the meantime.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&folio->mapping->i_private_lock);
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
- bit_start++;
+ spin_unlock(&folio->mapping->i_private_lock);
+ bit_start += sectors_per_node;
continue;
}
- start = page_start + bit_start * fs_info->sectorsize;
+ start = folio_start + bit_start * fs_info->sectorsize;
bit_start += sectors_per_node;
/*
@@ -1755,7 +1883,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
*/
eb = find_extent_buffer_nolock(fs_info, start);
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
/*
* The eb has already reached 0 refs thus find_extent_buffer()
@@ -1794,42 +1922,42 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* previous call.
* Return <0 for fatal error.
*/
-static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
+static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx)
{
struct writeback_control *wbc = ctx->wbc;
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct extent_buffer *eb;
int ret;
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return 0;
- if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(page, wbc);
+ if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ return submit_eb_subpage(folio, wbc);
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
/*
* Shouldn't happen and normally this would be a BUG_ON but no point
* crashing the machine for something we can survive anyway.
*/
if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
if (eb == ctx->eb) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (!ret)
return 0;
@@ -1862,7 +1990,7 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct btrfs_eb_write_context ctx = { .wbc = wbc };
- struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -1903,7 +2031,7 @@ retry:
for (i = 0; i < nr_folios; i++) {
struct folio *folio = fbatch.folios[i];
- ret = submit_eb_page(&folio->page, &ctx);
+ ret = submit_eb_page(folio, &ctx);
if (ret == 0)
continue;
if (ret < 0) {
@@ -1957,7 +2085,7 @@ retry:
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*
- * We can get ret > 0 from submit_extent_page() indicating how many ebs
+ * We can get ret > 0 from submit_extent_folio() indicating how many ebs
* were submitted. Reset it to 0 to avoid false alerts for the caller.
*/
if (ret > 0)
@@ -2096,7 +2224,7 @@ retry:
continue;
}
- ret = __extent_writepage(&folio->page, bio_ctrl);
+ ret = extent_writepage(folio, bio_ctrl);
if (ret < 0) {
done = 1;
break;
@@ -2143,14 +2271,14 @@ retry:
* already been ran (aka, ordered extent inserted) and all pages are still
* locked.
*/
-void extent_write_locked_range(struct inode *inode, struct page *locked_page,
+void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty)
{
bool found_error = false;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
const u32 sectorsize = fs_info->sectorsize;
loff_t i_size = i_size_read(inode);
u64 cur = start;
@@ -2167,42 +2295,50 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
while (cur <= end) {
u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
u32 cur_len = cur_end + 1 - cur;
- struct page *page;
- int nr = 0;
+ struct folio *folio;
+
+ folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0);
+
+ /*
+ * This shouldn't happen, the pages are pinned and locked, this
+ * code is just in case, but shouldn't actually be run.
+ */
+ if (IS_ERR(folio)) {
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
+ cur, cur_len, false);
+ mapping_set_error(mapping, PTR_ERR(folio));
+ cur = cur_end + 1;
+ continue;
+ }
- page = find_get_page(mapping, cur >> PAGE_SHIFT);
- ASSERT(PageLocked(page));
- if (pages_dirty && page != locked_page)
- ASSERT(PageDirty(page));
+ ASSERT(folio_test_locked(folio));
+ if (pages_dirty && folio != locked_folio)
+ ASSERT(folio_test_dirty(folio));
- ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl,
- i_size, &nr);
+ /*
+ * Set the submission bitmap to submit all sectors.
+ * extent_writepage_io() will do the truncation correctly.
+ */
+ bio_ctrl.submit_bitmap = (unsigned long)-1;
+ ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len,
+ &bio_ctrl, i_size);
if (ret == 1)
goto next_page;
- /* Make sure the mapping tag for page dirty gets cleared. */
- if (nr == 0) {
- set_page_writeback(page);
- end_page_writeback(page);
- }
- if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), page,
- cur, cur_len, !ret);
- mapping_set_error(page->mapping, ret);
- }
- btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
+ if (ret)
+ mapping_set_error(mapping, ret);
+ btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
- put_page(page);
+ folio_put(folio);
cur = cur_end + 1;
}
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
}
-int extent_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -2222,21 +2358,23 @@ int extent_writepages(struct address_space *mapping,
return ret;
}
-void extent_readahead(struct readahead_control *rac)
+void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
- struct page *pagepool[16];
+ struct folio *folio;
+ struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
+ const u64 start = readahead_pos(rac);
+ const u64 end = start + readahead_length(rac) - 1;
+ struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
- int nr;
- while ((nr = readahead_page_batch(rac, pagepool))) {
- u64 contig_start = readahead_pos(rac);
- u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
+ btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
- contiguous_readpages(pagepool, nr, contig_start, contig_end,
- &em_cached, &bio_ctrl, &prev_em_start);
- }
+ while ((folio = readahead_folio(rac)) != NULL)
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
if (em_cached)
free_extent_map(em_cached);
@@ -2254,7 +2392,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
struct extent_state *cached_state = NULL;
u64 start = folio_pos(folio);
u64 end = start + folio_size(folio) - 1;
- size_t blocksize = btrfs_sb(folio->mapping->host->i_sb)->sectorsize;
+ size_t blocksize = folio_to_fs_info(folio)->sectorsize;
/* This function is only called for the btree inode */
ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
@@ -2280,19 +2418,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-static int try_release_extent_state(struct extent_io_tree *tree,
- struct page *page, gfp_t mask)
+static bool try_release_extent_state(struct extent_io_tree *tree,
+ struct folio *folio, gfp_t mask)
{
- u64 start = page_offset(page);
+ u64 start = folio_pos(folio);
u64 end = start + PAGE_SIZE - 1;
- int ret = 1;
+ bool ret;
- if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
- ret = 0;
+ if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
+ ret = false;
} else {
u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
EXTENT_QGROUP_RESERVED);
+ int ret2;
/*
* At this point we can safely clear everything except the
@@ -2300,15 +2439,15 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+ ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
*/
- if (ret < 0)
- ret = 0;
+ if (ret2 < 0)
+ ret = false;
else
- ret = 1;
+ ret = true;
}
return ret;
}
@@ -2318,933 +2457,80 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
-int try_release_extent_mapping(struct page *page, gfp_t mask)
+bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
{
- struct extent_map *em;
- u64 start = page_offset(page);
+ u64 start = folio_pos(folio);
u64 end = start + PAGE_SIZE - 1;
- struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
- struct extent_io_tree *tree = &btrfs_inode->io_tree;
- struct extent_map_tree *map = &btrfs_inode->extent_tree;
-
- if (gfpflags_allow_blocking(mask) &&
- page->mapping->host->i_size > SZ_16M) {
- u64 len;
- while (start <= end) {
- struct btrfs_fs_info *fs_info;
- u64 cur_gen;
-
- len = end - start + 1;
- write_lock(&map->lock);
- em = lookup_extent_mapping(map, start, len);
- if (!em) {
- write_unlock(&map->lock);
- break;
- }
- if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
- em->start != start) {
- write_unlock(&map->lock);
- free_extent_map(em);
- break;
- }
- if (test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL))
- goto next;
- /*
- * If it's not in the list of modified extents, used
- * by a fast fsync, we can remove it. If it's being
- * logged we can safely remove it since fsync took an
- * extra reference on the em.
- */
- if (list_empty(&em->list) ||
- test_bit(EXTENT_FLAG_LOGGING, &em->flags))
- goto remove_em;
- /*
- * If it's in the list of modified extents, remove it
- * only if its generation is older then the current one,
- * in which case we don't need it for a fast fsync.
- * Otherwise don't remove it, we could be racing with an
- * ongoing fast fsync that could miss the new extent.
- */
- fs_info = btrfs_inode->root->fs_info;
- spin_lock(&fs_info->trans_lock);
- cur_gen = fs_info->generation;
- spin_unlock(&fs_info->trans_lock);
- if (em->generation >= cur_gen)
- goto next;
-remove_em:
- /*
- * We only remove extent maps that are not in the list of
- * modified extents or that are in the list but with a
- * generation lower then the current generation, so there
- * is no need to set the full fsync flag on the inode (it
- * hurts the fsync performance for workloads with a data
- * size that exceeds or is close to the system's memory).
- */
- remove_extent_mapping(map, em);
- /* once for the rb tree */
- free_extent_map(em);
-next:
- start = extent_map_end(em);
- write_unlock(&map->lock);
-
- /* once for us */
- free_extent_map(em);
-
- cond_resched(); /* Allow large-extent preemption. */
- }
- }
- return try_release_extent_state(tree, page, mask);
-}
-
-struct btrfs_fiemap_entry {
- u64 offset;
- u64 phys;
- u64 len;
- u32 flags;
-};
-
-/*
- * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file
- * range from the inode's io tree, unlock the subvolume tree search path, flush
- * the fiemap cache and relock the file range and research the subvolume tree.
- * The value here is something negative that can't be confused with a valid
- * errno value and different from 1 because that's also a return value from
- * fiemap_fill_next_extent() and also it's often used to mean some btree search
- * did not find a key, so make it some distinct negative value.
- */
-#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
-
-/*
- * Used to:
- *
- * - Cache the next entry to be emitted to the fiemap buffer, so that we can
- * merge extents that are contiguous and can be grouped as a single one;
- *
- * - Store extents ready to be written to the fiemap buffer in an intermediary
- * buffer. This intermediary buffer is to ensure that in case the fiemap
- * buffer is memory mapped to the fiemap target file, we don't deadlock
- * during btrfs_page_mkwrite(). This is because during fiemap we are locking
- * an extent range in order to prevent races with delalloc flushing and
- * ordered extent completion, which is needed in order to reliably detect
- * delalloc in holes and prealloc extents. And this can lead to a deadlock
- * if the fiemap buffer is memory mapped to the file we are running fiemap
- * against (a silly, useless in practice scenario, but possible) because
- * btrfs_page_mkwrite() will try to lock the same extent range.
- */
-struct fiemap_cache {
- /* An array of ready fiemap entries. */
- struct btrfs_fiemap_entry *entries;
- /* Number of entries in the entries array. */
- int entries_size;
- /* Index of the next entry in the entries array to write to. */
- int entries_pos;
- /*
- * Once the entries array is full, this indicates what's the offset for
- * the next file extent item we must search for in the inode's subvolume
- * tree after unlocking the extent range in the inode's io tree and
- * releasing the search path.
- */
- u64 next_search_offset;
- /*
- * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
- * to count ourselves emitted extents and stop instead of relying on
- * fiemap_fill_next_extent() because we buffer ready fiemap entries at
- * the @entries array, and we want to stop as soon as we hit the max
- * amount of extents to map, not just to save time but also to make the
- * logic at extent_fiemap() simpler.
- */
- unsigned int extents_mapped;
- /* Fields for the cached extent (unsubmitted, not ready, extent). */
- u64 offset;
- u64 phys;
- u64 len;
- u32 flags;
- bool cached;
-};
-
-static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache)
-{
- for (int i = 0; i < cache->entries_pos; i++) {
- struct btrfs_fiemap_entry *entry = &cache->entries[i];
- int ret;
-
- ret = fiemap_fill_next_extent(fieinfo, entry->offset,
- entry->phys, entry->len,
- entry->flags);
- /*
- * Ignore 1 (reached max entries) because we keep track of that
- * ourselves in emit_fiemap_extent().
- */
- if (ret < 0)
- return ret;
- }
- cache->entries_pos = 0;
-
- return 0;
-}
-
-/*
- * Helper to submit fiemap extent.
- *
- * Will try to merge current fiemap extent specified by @offset, @phys,
- * @len and @flags with cached one.
- * And only when we fails to merge, cached one will be submitted as
- * fiemap extent.
- *
- * Return value is the same as fiemap_fill_next_extent().
- */
-static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache,
- u64 offset, u64 phys, u64 len, u32 flags)
-{
- struct btrfs_fiemap_entry *entry;
- u64 cache_end;
-
- /* Set at the end of extent_fiemap(). */
- ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
-
- if (!cache->cached)
- goto assign;
-
- /*
- * When iterating the extents of the inode, at extent_fiemap(), we may
- * find an extent that starts at an offset behind the end offset of the
- * previous extent we processed. This happens if fiemap is called
- * without FIEMAP_FLAG_SYNC and there are ordered extents completing
- * after we had to unlock the file range, release the search path, emit
- * the fiemap extents stored in the buffer (cache->entries array) and
- * the lock the remainder of the range and re-search the btree.
- *
- * For example we are in leaf X processing its last item, which is the
- * file extent item for file range [512K, 1M[, and after
- * btrfs_next_leaf() releases the path, there's an ordered extent that
- * completes for the file range [768K, 2M[, and that results in trimming
- * the file extent item so that it now corresponds to the file range
- * [512K, 768K[ and a new file extent item is inserted for the file
- * range [768K, 2M[, which may end up as the last item of leaf X or as
- * the first item of the next leaf - in either case btrfs_next_leaf()
- * will leave us with a path pointing to the new extent item, for the
- * file range [768K, 2M[, since that's the first key that follows the
- * last one we processed. So in order not to report overlapping extents
- * to user space, we trim the length of the previously cached extent and
- * emit it.
- *
- * Upon calling btrfs_next_leaf() we may also find an extent with an
- * offset smaller than or equals to cache->offset, and this happens
- * when we had a hole or prealloc extent with several delalloc ranges in
- * it, but after btrfs_next_leaf() released the path, delalloc was
- * flushed and the resulting ordered extents were completed, so we can
- * now have found a file extent item for an offset that is smaller than
- * or equals to what we have in cache->offset. We deal with this as
- * described below.
- */
- cache_end = cache->offset + cache->len;
- if (cache_end > offset) {
- if (offset == cache->offset) {
- /*
- * We cached a dealloc range (found in the io tree) for
- * a hole or prealloc extent and we have now found a
- * file extent item for the same offset. What we have
- * now is more recent and up to date, so discard what
- * we had in the cache and use what we have just found.
- */
- goto assign;
- } else if (offset > cache->offset) {
- /*
- * The extent range we previously found ends after the
- * offset of the file extent item we found and that
- * offset falls somewhere in the middle of that previous
- * extent range. So adjust the range we previously found
- * to end at the offset of the file extent item we have
- * just found, since this extent is more up to date.
- * Emit that adjusted range and cache the file extent
- * item we have just found. This corresponds to the case
- * where a previously found file extent item was split
- * due to an ordered extent completing.
- */
- cache->len = offset - cache->offset;
- goto emit;
- } else {
- const u64 range_end = offset + len;
-
- /*
- * The offset of the file extent item we have just found
- * is behind the cached offset. This means we were
- * processing a hole or prealloc extent for which we
- * have found delalloc ranges (in the io tree), so what
- * we have in the cache is the last delalloc range we
- * found while the file extent item we found can be
- * either for a whole delalloc range we previously
- * emmitted or only a part of that range.
- *
- * We have two cases here:
- *
- * 1) The file extent item's range ends at or behind the
- * cached extent's end. In this case just ignore the
- * current file extent item because we don't want to
- * overlap with previous ranges that may have been
- * emmitted already;
- *
- * 2) The file extent item starts behind the currently
- * cached extent but its end offset goes beyond the
- * end offset of the cached extent. We don't want to
- * overlap with a previous range that may have been
- * emmitted already, so we emit the currently cached
- * extent and then partially store the current file
- * extent item's range in the cache, for the subrange
- * going the cached extent's end to the end of the
- * file extent item.
- */
- if (range_end <= cache_end)
- return 0;
-
- if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
- phys += cache_end - offset;
-
- offset = cache_end;
- len = range_end - cache_end;
- goto emit;
+ struct btrfs_inode *inode = folio_to_inode(folio);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+
+ while (start <= end) {
+ const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ const u64 len = end - start + 1;
+ struct extent_map_tree *extent_tree = &inode->extent_tree;
+ struct extent_map *em;
+
+ write_lock(&extent_tree->lock);
+ em = lookup_extent_mapping(extent_tree, start, len);
+ if (!em) {
+ write_unlock(&extent_tree->lock);
+ break;
}
- }
-
- /*
- * Only merges fiemap extents if
- * 1) Their logical addresses are continuous
- *
- * 2) Their physical addresses are continuous
- * So truly compressed (physical size smaller than logical size)
- * extents won't get merged with each other
- *
- * 3) Share same flags
- */
- if (cache->offset + cache->len == offset &&
- cache->phys + cache->len == phys &&
- cache->flags == flags) {
- cache->len += len;
- return 0;
- }
-
-emit:
- /* Not mergeable, need to submit cached one */
-
- if (cache->entries_pos == cache->entries_size) {
- /*
- * We will need to research for the end offset of the last
- * stored extent and not from the current offset, because after
- * unlocking the range and releasing the path, if there's a hole
- * between that end offset and this current offset, a new extent
- * may have been inserted due to a new write, so we don't want
- * to miss it.
- */
- entry = &cache->entries[cache->entries_size - 1];
- cache->next_search_offset = entry->offset + entry->len;
- cache->cached = false;
-
- return BTRFS_FIEMAP_FLUSH_CACHE;
- }
-
- entry = &cache->entries[cache->entries_pos];
- entry->offset = cache->offset;
- entry->phys = cache->phys;
- entry->len = cache->len;
- entry->flags = cache->flags;
- cache->entries_pos++;
- cache->extents_mapped++;
-
- if (cache->extents_mapped == fieinfo->fi_extents_max) {
- cache->cached = false;
- return 1;
- }
-assign:
- cache->cached = true;
- cache->offset = offset;
- cache->phys = phys;
- cache->len = len;
- cache->flags = flags;
-
- return 0;
-}
-
-/*
- * Emit last fiemap cache
- *
- * The last fiemap cache may still be cached in the following case:
- * 0 4k 8k
- * |<- Fiemap range ->|
- * |<------------ First extent ----------->|
- *
- * In this case, the first extent range will be cached but not emitted.
- * So we must emit it before ending extent_fiemap().
- */
-static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache)
-{
- int ret;
-
- if (!cache->cached)
- return 0;
-
- ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
- cache->len, cache->flags);
- cache->cached = false;
- if (ret > 0)
- ret = 0;
- return ret;
-}
-
-static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
-{
- struct extent_buffer *clone;
- struct btrfs_key key;
- int slot;
- int ret;
-
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
- return 0;
-
- ret = btrfs_next_leaf(inode->root, path);
- if (ret != 0)
- return ret;
-
- /*
- * Don't bother with cloning if there are no more file extent items for
- * our inode.
- */
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
- return 1;
-
- /* See the comment at fiemap_search_slot() about why we clone. */
- clone = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!clone)
- return -ENOMEM;
-
- slot = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = clone;
- path->slots[0] = slot;
-
- return 0;
-}
-
-/*
- * Search for the first file extent item that starts at a given file offset or
- * the one that starts immediately before that offset.
- * Returns: 0 on success, < 0 on error, 1 if not found.
- */
-static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
- u64 file_offset)
-{
- const u64 ino = btrfs_ino(inode);
- struct btrfs_root *root = inode->root;
- struct extent_buffer *clone;
- struct btrfs_key key;
- int slot;
- int ret;
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = file_offset;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- if (ret > 0 && path->slots[0] > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path->slots[0]--;
- }
-
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret != 0)
- return ret;
-
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
- return 1;
- }
-
- /*
- * We clone the leaf and use it during fiemap. This is because while
- * using the leaf we do expensive things like checking if an extent is
- * shared, which can take a long time. In order to prevent blocking
- * other tasks for too long, we use a clone of the leaf. We have locked
- * the file range in the inode's io tree, so we know none of our file
- * extent items can change. This way we avoid blocking other tasks that
- * want to insert items for other inodes in the same leaf or b+tree
- * rebalance operations (triggered for example when someone is trying
- * to push items into this leaf when trying to insert an item in a
- * neighbour leaf).
- * We also need the private clone because holding a read lock on an
- * extent buffer of the subvolume's b+tree will make lockdep unhappy
- * when we check if extents are shared, as backref walking may need to
- * lock the same leaf we are processing.
- */
- clone = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!clone)
- return -ENOMEM;
-
- slot = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = clone;
- path->slots[0] = slot;
-
- return 0;
-}
-
-/*
- * Process a range which is a hole or a prealloc extent in the inode's subvolume
- * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
- * extent. The end offset (@end) is inclusive.
- */
-static int fiemap_process_hole(struct btrfs_inode *inode,
- struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache,
- struct extent_state **delalloc_cached_state,
- struct btrfs_backref_share_check_ctx *backref_ctx,
- u64 disk_bytenr, u64 extent_offset,
- u64 extent_gen,
- u64 start, u64 end)
-{
- const u64 i_size = i_size_read(&inode->vfs_inode);
- u64 cur_offset = start;
- u64 last_delalloc_end = 0;
- u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
- bool checked_extent_shared = false;
- int ret;
-
- /*
- * There can be no delalloc past i_size, so don't waste time looking for
- * it beyond i_size.
- */
- while (cur_offset < end && cur_offset < i_size) {
- u64 delalloc_start;
- u64 delalloc_end;
- u64 prealloc_start;
- u64 prealloc_len = 0;
- bool delalloc;
-
- delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
- delalloc_cached_state,
- &delalloc_start,
- &delalloc_end);
- if (!delalloc)
+ if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
+ write_unlock(&extent_tree->lock);
+ free_extent_map(em);
break;
-
+ }
+ if (test_range_bit_exists(io_tree, em->start,
+ extent_map_end(em) - 1, EXTENT_LOCKED))
+ goto next;
/*
- * If this is a prealloc extent we have to report every section
- * of it that has no delalloc.
+ * If it's not in the list of modified extents, used by a fast
+ * fsync, we can remove it. If it's being logged we can safely
+ * remove it since fsync took an extra reference on the em.
*/
- if (disk_bytenr != 0) {
- if (last_delalloc_end == 0) {
- prealloc_start = start;
- prealloc_len = delalloc_start - start;
- } else {
- prealloc_start = last_delalloc_end + 1;
- prealloc_len = delalloc_start - prealloc_start;
- }
- }
-
- if (prealloc_len > 0) {
- if (!checked_extent_shared && fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- prealloc_flags |= FIEMAP_EXTENT_SHARED;
-
- checked_extent_shared = true;
- }
- ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
- disk_bytenr + extent_offset,
- prealloc_len, prealloc_flags);
- if (ret)
- return ret;
- extent_offset += prealloc_len;
- }
-
- ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
- delalloc_end + 1 - delalloc_start,
- FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- if (ret)
- return ret;
-
- last_delalloc_end = delalloc_end;
- cur_offset = delalloc_end + 1;
- extent_offset += cur_offset - delalloc_start;
- cond_resched();
- }
-
- /*
- * Either we found no delalloc for the whole prealloc extent or we have
- * a prealloc extent that spans i_size or starts at or after i_size.
- */
- if (disk_bytenr != 0 && last_delalloc_end < end) {
- u64 prealloc_start;
- u64 prealloc_len;
-
- if (last_delalloc_end == 0) {
- prealloc_start = start;
- prealloc_len = end + 1 - start;
- } else {
- prealloc_start = last_delalloc_end + 1;
- prealloc_len = end + 1 - prealloc_start;
- }
-
- if (!checked_extent_shared && fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- prealloc_flags |= FIEMAP_EXTENT_SHARED;
- }
- ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
- disk_bytenr + extent_offset,
- prealloc_len, prealloc_flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
- struct btrfs_path *path,
- u64 *last_extent_end_ret)
-{
- const u64 ino = btrfs_ino(inode);
- struct btrfs_root *root = inode->root;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *ei;
- struct btrfs_key key;
- u64 disk_bytenr;
- int ret;
-
- /*
- * Lookup the last file extent. We're not using i_size here because
- * there might be preallocation past i_size.
- */
- ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
- /* There can't be a file extent item at offset (u64)-1 */
- ASSERT(ret != 0);
- if (ret < 0)
- return ret;
-
- /*
- * For a non-existing key, btrfs_search_slot() always leaves us at a
- * slot > 0, except if the btree is empty, which is impossible because
- * at least it has the inode item for this inode and all the items for
- * the root inode 256.
- */
- ASSERT(path->slots[0] > 0);
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
- /* No file extent items in the subvolume tree. */
- *last_extent_end_ret = 0;
- return 0;
- }
-
- /*
- * For an inline extent, the disk_bytenr is where inline data starts at,
- * so first check if we have an inline extent item before checking if we
- * have an implicit hole (disk_bytenr == 0).
- */
- ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
- *last_extent_end_ret = btrfs_file_extent_end(path);
- return 0;
- }
-
- /*
- * Find the last file extent item that is not a hole (when NO_HOLES is
- * not enabled). This should take at most 2 iterations in the worst
- * case: we have one hole file extent item at slot 0 of a leaf and
- * another hole file extent item as the last item in the previous leaf.
- * This is because we merge file extent items that represent holes.
- */
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- while (disk_bytenr == 0) {
- ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- /* No file extent items that are not holes. */
- *last_extent_end_ret = 0;
- return 0;
- }
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- }
-
- *last_extent_end_ret = btrfs_file_extent_end(path);
- return 0;
-}
-
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
- struct extent_state *delalloc_cached_state = NULL;
- struct btrfs_path *path;
- struct fiemap_cache cache = { 0 };
- struct btrfs_backref_share_check_ctx *backref_ctx;
- u64 last_extent_end;
- u64 prev_extent_end;
- u64 range_start;
- u64 range_end;
- const u64 sectorsize = inode->root->fs_info->sectorsize;
- bool stopped = false;
- int ret;
-
- cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
- cache.entries = kmalloc_array(cache.entries_size,
- sizeof(struct btrfs_fiemap_entry),
- GFP_KERNEL);
- backref_ctx = btrfs_alloc_backref_share_check_ctx();
- path = btrfs_alloc_path();
- if (!cache.entries || !backref_ctx || !path) {
- ret = -ENOMEM;
- goto out;
- }
-
-restart:
- range_start = round_down(start, sectorsize);
- range_end = round_up(start + len, sectorsize);
- prev_extent_end = range_start;
-
- lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
-
- ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
- if (ret < 0)
- goto out_unlock;
- btrfs_release_path(path);
-
- path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, range_start);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
+ if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING))
+ goto remove_em;
/*
- * No file extent item found, but we may have delalloc between
- * the current offset and i_size. So check for that.
+ * If it's in the list of modified extents, remove it only if
+ * its generation is older then the current one, in which case
+ * we don't need it for a fast fsync. Otherwise don't remove it,
+ * we could be racing with an ongoing fast fsync that could miss
+ * the new extent.
*/
- ret = 0;
- goto check_eof_delalloc;
- }
-
- while (prev_extent_end < range_end) {
- struct extent_buffer *leaf = path->nodes[0];
- struct btrfs_file_extent_item *ei;
- struct btrfs_key key;
- u64 extent_end;
- u64 extent_len;
- u64 extent_offset = 0;
- u64 extent_gen;
- u64 disk_bytenr = 0;
- u64 flags = 0;
- int extent_type;
- u8 compression;
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
- break;
-
- extent_end = btrfs_file_extent_end(path);
-
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
/*
- * The first iteration can leave us at an extent item that ends
- * before our range's start. Move to the next item.
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower then the current generation, so there is no
+ * need to set the full fsync flag on the inode (it hurts the
+ * fsync performance for workloads with a data size that exceeds
+ * or is close to the system's memory).
*/
- if (extent_end <= range_start)
- goto next_item;
-
- backref_ctx->curr_leaf_bytenr = leaf->start;
-
- /* We have in implicit hole (NO_HOLES feature enabled). */
- if (prev_extent_end < key.offset) {
- const u64 hole_end = min(key.offset, range_end) - 1;
+ remove_extent_mapping(inode, em);
+ /* Once for the inode's extent map tree. */
+ free_extent_map(em);
+next:
+ start = extent_map_end(em);
+ write_unlock(&extent_tree->lock);
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx, 0, 0, 0,
- prev_extent_end, hole_end);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* fiemap_fill_next_extent() told us to stop. */
- stopped = true;
- break;
- }
+ /* Once for us, for the lookup_extent_mapping() reference. */
+ free_extent_map(em);
- /* We've reached the end of the fiemap range, stop. */
- if (key.offset >= range_end) {
- stopped = true;
+ if (need_resched()) {
+ /*
+ * If we need to resched but we can't block just exit
+ * and leave any remaining extent maps.
+ */
+ if (!gfpflags_allow_blocking(mask))
break;
- }
- }
-
- extent_len = extent_end - key.offset;
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- compression = btrfs_file_extent_compression(leaf, ei);
- extent_type = btrfs_file_extent_type(leaf, ei);
- extent_gen = btrfs_file_extent_generation(leaf, ei);
-
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- if (compression == BTRFS_COMPRESS_NONE)
- extent_offset = btrfs_file_extent_offset(leaf, ei);
- }
-
- if (compression != BTRFS_COMPRESS_NONE)
- flags |= FIEMAP_EXTENT_ENCODED;
-
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- flags |= FIEMAP_EXTENT_DATA_INLINE;
- flags |= FIEMAP_EXTENT_NOT_ALIGNED;
- ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
- extent_len, flags);
- } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx,
- disk_bytenr, extent_offset,
- extent_gen, key.offset,
- extent_end - 1);
- } else if (disk_bytenr == 0) {
- /* We have an explicit hole. */
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx, 0, 0, 0,
- key.offset, extent_end - 1);
- } else {
- /* We have a regular extent. */
- if (fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- goto out_unlock;
- else if (ret > 0)
- flags |= FIEMAP_EXTENT_SHARED;
- }
-
- ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
- disk_bytenr + extent_offset,
- extent_len, flags);
- }
-
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* emit_fiemap_extent() told us to stop. */
- stopped = true;
- break;
- }
-
- prev_extent_end = extent_end;
-next_item:
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- goto out_unlock;
- }
- ret = fiemap_next_leaf_item(inode, path);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* No more file extent items for this inode. */
- break;
+ cond_resched();
}
- cond_resched();
}
-
-check_eof_delalloc:
- if (!stopped && prev_extent_end < range_end) {
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, range_end - 1);
- if (ret < 0)
- goto out_unlock;
- prev_extent_end = range_end;
- }
-
- if (cache.cached && cache.offset + cache.len >= last_extent_end) {
- const u64 i_size = i_size_read(&inode->vfs_inode);
-
- if (prev_extent_end < i_size) {
- u64 delalloc_start;
- u64 delalloc_end;
- bool delalloc;
-
- delalloc = btrfs_find_delalloc_in_range(inode,
- prev_extent_end,
- i_size - 1,
- &delalloc_cached_state,
- &delalloc_start,
- &delalloc_end);
- if (!delalloc)
- cache.flags |= FIEMAP_EXTENT_LAST;
- } else {
- cache.flags |= FIEMAP_EXTENT_LAST;
- }
- }
-
-out_unlock:
- unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
-
- if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
- btrfs_release_path(path);
- ret = flush_fiemap_cache(fieinfo, &cache);
- if (ret)
- goto out;
- len -= cache.next_search_offset - start;
- start = cache.next_search_offset;
- goto restart;
- } else if (ret < 0) {
- goto out;
- }
-
- /*
- * Must free the path before emitting to the fiemap buffer because we
- * may have a non-cloned leaf and if the fiemap buffer is memory mapped
- * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
- * waiting for an ordered extent that in order to complete needs to
- * modify that leaf, therefore leading to a deadlock.
- */
- btrfs_free_path(path);
- path = NULL;
-
- ret = flush_fiemap_cache(fieinfo, &cache);
- if (ret)
- goto out;
-
- ret = emit_last_fiemap_cache(fieinfo, &cache);
-out:
- free_extent_state(delalloc_cached_state);
- kfree(cache.entries);
- btrfs_free_backref_share_ctx(backref_ctx);
- btrfs_free_path(path);
- return ret;
+ return try_release_extent_state(io_tree, folio, mask);
}
static void __free_extent_buffer(struct extent_buffer *eb)
@@ -3258,41 +2544,35 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
+static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- lockdep_assert_held(&page->mapping->private_lock);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- if (PagePrivate(page)) {
- subpage = (struct btrfs_subpage *)page->private;
+ if (folio_test_private(folio)) {
+ subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs))
return true;
- /*
- * Even there is no eb refs here, we may still have
- * end_page_read() call relying on page::private.
- */
- if (atomic_read(&subpage->readers))
- return true;
}
return false;
}
-static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
- * For mapped eb, we're going to change the page private, which should
- * be done under the private_lock.
+ * For mapped eb, we're going to change the folio private, which should
+ * be done under the i_private_lock.
*/
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
- if (!PagePrivate(page)) {
+ if (!folio_test_private(folio)) {
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
@@ -3301,66 +2581,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
* and have this page now attached to the new eb. So
- * only clear page_private if it's still connected to
+ * only clear folio if it's still connected to
* this eb.
*/
- if (PagePrivate(page) &&
- page->private == (unsigned long)eb) {
+ if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(PageDirty(page));
- BUG_ON(PageWriteback(page));
- /*
- * We need to make sure we haven't be attached
- * to a new eb.
- */
- detach_page_private(page);
+ BUG_ON(folio_test_dirty(folio));
+ BUG_ON(folio_test_writeback(folio));
+ /* We need to make sure we haven't be attached to a new eb. */
+ folio_detach_private(folio);
}
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
/*
- * For subpage, we can have dummy eb with page private. In this case,
- * we can directly detach the private as such page is only attached to
- * one dummy eb, no sharing.
+ * For subpage, we can have dummy eb with folio private attached. In
+ * this case, we can directly detach the private as such folio is only
+ * attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, page);
+ btrfs_detach_subpage(fs_info, folio);
return;
}
- btrfs_page_dec_eb_refs(fs_info, page);
+ btrfs_folio_dec_eb_refs(fs_info, folio);
/*
- * We can only detach the page private if there are no other ebs in the
+ * We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO.
*/
- if (!page_range_has_eb(fs_info, page))
- btrfs_detach_subpage(fs_info, page);
+ if (!folio_range_has_eb(fs_info, folio))
+ btrfs_detach_subpage(fs_info, folio);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
}
/* Release all pages attached to the extent buffer */
-static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
{
- int i;
- int num_pages;
-
ASSERT(!extent_buffer_under_io(eb));
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
+ struct folio *folio = eb->folios[i];
- if (!page)
+ if (!folio)
continue;
- detach_extent_buffer_page(eb, page);
+ detach_extent_buffer_folio(eb, folio);
- /* One for when we allocated the page */
- put_page(page);
+ /* One for when we allocated the folio. */
+ folio_put(folio);
}
}
@@ -3398,9 +2670,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
- int i;
struct extent_buffer *new;
- int num_pages = num_extent_pages(src);
+ int num_folios = num_extent_folios(src);
int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
@@ -3414,22 +2685,21 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- ret = btrfs_alloc_page_array(num_pages, new->pages);
+ ret = alloc_eb_folio_array(new, false);
if (ret) {
btrfs_release_extent_buffer(new);
return NULL;
}
- for (i = 0; i < num_pages; i++) {
- int ret;
- struct page *p = new->pages[i];
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = new->folios[i];
- ret = attach_extent_buffer_page(new, p, NULL);
+ ret = attach_extent_buffer_folio(new, folio, NULL);
if (ret < 0) {
btrfs_release_extent_buffer(new);
return NULL;
}
- WARN_ON(PageDirty(p));
+ WARN_ON(folio_test_dirty(folio));
}
copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
@@ -3441,23 +2711,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
struct extent_buffer *eb;
- int num_pages;
- int i;
+ int num_folios = 0;
int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
- num_pages = num_extent_pages(eb);
- ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ ret = alloc_eb_folio_array(eb, false);
if (ret)
goto err;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- ret = attach_extent_buffer_page(eb, p, NULL);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
goto err;
}
@@ -3468,10 +2735,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
err:
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i]) {
- detach_extent_buffer_page(eb, eb->pages[i]);
- __free_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++) {
+ if (eb->folios[i]) {
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3520,20 +2787,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb,
- struct page *accessed)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_pages, i;
+ int num_folios= num_extent_folios(eb);
check_buffer_tree_ref(eb);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- if (p != accessed)
- mark_page_accessed(p);
- }
+ for (int i = 0; i < num_folios; i++)
+ folio_mark_accessed(eb->folios[i]);
}
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -3561,14 +2822,14 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
spin_lock(&eb->refs_lock);
spin_unlock(&eb->refs_lock);
}
- mark_extent_buffer_accessed(eb, NULL);
+ mark_extent_buffer_accessed(eb);
return eb;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -3604,14 +2865,20 @@ again:
free_eb:
btrfs_release_extent_buffer(eb);
return exists;
-}
+#else
+ /* Stub to avoid linker error when compiled with optimizations turned off. */
+ return NULL;
#endif
+}
static struct extent_buffer *grab_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *exists;
+ lockdep_assert_held(&page->mapping->i_private_lock);
+
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
@@ -3621,21 +2888,21 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
/* Page not yet attached to an extent buffer */
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return NULL;
/*
* We could have already allocated an eb for this page and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
- * just overwrite page->private.
+ * just overwrite folio private.
*/
- exists = (struct extent_buffer *)page->private;
+ exists = folio_get_private(folio);
if (atomic_inc_not_zero(&exists->refs))
return exists;
WARN_ON(PageDirty(page));
- detach_page_private(page);
+ folio_detach_private(folio);
return NULL;
}
@@ -3660,6 +2927,101 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
start, fs_info->nodesize);
return -EINVAL;
}
+ if (!IS_ALIGNED(start, fs_info->nodesize) &&
+ !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+ btrfs_warn(fs_info,
+"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
+ start, fs_info->nodesize);
+ }
+ return 0;
+}
+
+
+/*
+ * Return 0 if eb->folios[i] is attached to btree inode successfully.
+ * Return >0 if there is already another extent buffer for the range,
+ * and @found_eb_ret would be updated.
+ * Return -EAGAIN if the filemap has an existing folio but with different size
+ * than @eb.
+ * The caller needs to free the existing folios and retry using the same order.
+ */
+static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct btrfs_subpage *prealloc,
+ struct extent_buffer **found_eb_ret)
+{
+
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ const unsigned long index = eb->start >> PAGE_SHIFT;
+ struct folio *existing_folio = NULL;
+ int ret;
+
+ ASSERT(found_eb_ret);
+
+ /* Caller should ensure the folio exists. */
+ ASSERT(eb->folios[i]);
+
+retry:
+ ret = filemap_add_folio(mapping, eb->folios[i], index + i,
+ GFP_NOFS | __GFP_NOFAIL);
+ if (!ret)
+ goto finish;
+
+ existing_folio = filemap_lock_folio(mapping, index + i);
+ /* The page cache only exists for a very short time, just retry. */
+ if (IS_ERR(existing_folio)) {
+ existing_folio = NULL;
+ goto retry;
+ }
+
+ /* For now, we should only have single-page folios for btree inode. */
+ ASSERT(folio_nr_pages(existing_folio) == 1);
+
+ if (folio_size(existing_folio) != eb->folio_size) {
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return -EAGAIN;
+ }
+
+finish:
+ spin_lock(&mapping->i_private_lock);
+ if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ /* We're going to reuse the existing page, can drop our folio now. */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ } else if (existing_folio) {
+ struct extent_buffer *existing_eb;
+
+ existing_eb = grab_extent_buffer(fs_info,
+ folio_page(existing_folio, 0));
+ if (existing_eb) {
+ /* The extent buffer still exists, we can use it directly. */
+ *found_eb_ret = existing_eb;
+ spin_unlock(&mapping->i_private_lock);
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return 1;
+ }
+ /* The extent buffer no longer exists, we can reuse the folio. */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ }
+ eb->folio_size = folio_size(eb->folios[i]);
+ eb->folio_shift = folio_shift(eb->folios[i]);
+ /* Should not fail, as we have preallocated the memory. */
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
+ ASSERT(!ret);
+ /*
+ * To inform we have an extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the folio private when the
+ * eb hasn't been inserted into radix tree yet.
+ *
+ * The ref will be decreased when the eb releases the page, in
+ * detach_extent_buffer_page(). Thus needs no special handling in the
+ * error path.
+ */
+ btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
@@ -3667,15 +3029,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
- int num_pages;
- int i;
- unsigned long index = start >> PAGE_SHIFT;
+ int num_folios;
+ int attached = 0;
struct extent_buffer *eb;
- struct extent_buffer *exists = NULL;
- struct page *p;
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ struct extent_buffer *existing_eb = NULL;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
+ bool page_contig = true;
int uptodate = 1;
int ret;
@@ -3710,11 +3070,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
- num_pages = num_extent_pages(eb);
-
/*
- * Preallocate page->private for subpage case, so that we won't
- * allocate memory with private_lock nor page lock hold.
+ * Preallocate folio private for subpage case, so that we won't
+ * allocate memory with i_private_lock nor page lock hold.
*
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
@@ -3722,47 +3080,73 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (fs_info->nodesize < PAGE_SIZE) {
prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
- exists = ERR_CAST(prealloc);
- goto free_eb;
+ ret = PTR_ERR(prealloc);
+ goto out;
}
}
- for (i = 0; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
- if (!p) {
- exists = ERR_PTR(-ENOMEM);
- btrfs_free_subpage(prealloc);
- goto free_eb;
- }
+reallocate:
+ /* Allocate all pages first. */
+ ret = alloc_eb_folio_array(eb, true);
+ if (ret < 0) {
+ btrfs_free_subpage(prealloc);
+ goto out;
+ }
- spin_lock(&mapping->private_lock);
- exists = grab_extent_buffer(fs_info, p);
- if (exists) {
- spin_unlock(&mapping->private_lock);
- unlock_page(p);
- put_page(p);
- mark_extent_buffer_accessed(exists, p);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+ num_folios = num_extent_folios(eb);
+ /* Attach all pages to the filemap. */
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio;
+
+ ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
+ if (ret > 0) {
+ ASSERT(existing_eb);
+ goto out;
}
- /* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_page(eb, p, prealloc);
- ASSERT(!ret);
+
/*
- * To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the page private
- * when the eb hasn't yet been inserted into radix tree.
+ * TODO: Special handling for a corner case where the order of
+ * folios mismatch between the new eb and filemap.
+ *
+ * This happens when:
*
- * The ref will be decreased when the eb released the page, in
- * detach_extent_buffer_page().
- * Thus needs no special handling in error path.
+ * - the new eb is using higher order folio
+ *
+ * - the filemap is still using 0-order folios for the range
+ * This can happen at the previous eb allocation, and we don't
+ * have higher order folio for the call.
+ *
+ * - the existing eb has already been freed
+ *
+ * In this case, we have to free the existing folios first, and
+ * re-allocate using the same order.
+ * Thankfully this is not going to happen yet, as we're still
+ * using 0-order folios.
+ */
+ if (unlikely(ret == -EAGAIN)) {
+ ASSERT(0);
+ goto reallocate;
+ }
+ attached++;
+
+ /*
+ * Only after attach_eb_folio_to_filemap(), eb->folios[] is
+ * reliable, as we may choose to reuse the existing page cache
+ * and free the allocated page.
+ */
+ folio = eb->folios[i];
+ WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+
+ /*
+ * Check if the current page is physically contiguous with previous eb
+ * page.
+ * At this stage, either we allocated a large folio, thus @i
+ * would only be 0, or we fall back to per-page allocation.
*/
- btrfs_page_inc_eb_refs(fs_info, p);
- spin_unlock(&mapping->private_lock);
+ if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
+ page_contig = false;
- WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
- eb->pages[i] = p;
- if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
+ if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
uptodate = 0;
/*
@@ -3775,12 +3159,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ /* All pages are physically contiguous, can skip cross page handling. */
+ if (page_contig)
+ eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
+ if (ret)
+ goto out;
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
@@ -3788,9 +3173,10 @@ again:
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
+ ret = 0;
+ existing_eb = find_extent_buffer(fs_info, start);
+ if (existing_eb)
+ goto out;
else
goto again;
}
@@ -3803,19 +3189,46 @@ again:
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (i = 0; i < num_pages; i++)
- unlock_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++)
+ unlock_page(folio_page(eb->folios[i], 0));
return eb;
-free_eb:
+out:
WARN_ON(!atomic_dec_and_test(&eb->refs));
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i])
- unlock_page(eb->pages[i]);
+
+ /*
+ * Any attached folios need to be detached before we unlock them. This
+ * is because when we're inserting our new folios into the mapping, and
+ * then attaching our eb to that folio. If we fail to insert our folio
+ * we'll lookup the folio for that index, and grab that EB. We do not
+ * want that to grab this eb, as we're getting ready to free it. So we
+ * have to detach it first and then unlock it.
+ *
+ * We have to drop our reference and NULL it out here because in the
+ * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
+ * Below when we call btrfs_release_extent_buffer() we will call
+ * detach_extent_buffer_folio() on our remaining pages in the !subpage
+ * case. If we left eb->folios[i] populated in the subpage case we'd
+ * double put our reference and be super sad.
+ */
+ for (int i = 0; i < attached; i++) {
+ ASSERT(eb->folios[i]);
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ unlock_page(folio_page(eb->folios[i], 0));
+ folio_put(eb->folios[i]);
+ eb->folios[i] = NULL;
}
+ /*
+ * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
+ * so it can be cleaned up without utlizing page->mapping.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
btrfs_release_extent_buffer(eb);
- return exists;
+ if (ret < 0)
+ return ERR_PTR(ret);
+ ASSERT(existing_eb);
+ return existing_eb;
}
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
@@ -3907,31 +3320,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_page_dirty(struct page *page)
+static void btree_clear_folio_dirty(struct folio *folio)
{
- ASSERT(PageDirty(page));
- ASSERT(PageLocked(page));
- clear_page_dirty_for_io(page);
- xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page))
- __xa_clear_mark(&page->mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&page->mapping->i_pages);
+ ASSERT(folio_test_dirty(folio));
+ ASSERT(folio_test_locked(folio));
+ folio_clear_dirty_for_io(folio);
+ xa_lock_irq(&folio->mapping->i_pages);
+ if (!folio_test_dirty(folio))
+ __xa_clear_mark(&folio->mapping->i_pages,
+ folio_index(folio), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&folio->mapping->i_pages);
}
static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page = eb->pages[0];
+ struct folio *folio = eb->folios[0];
bool last;
- /* btree_clear_page_dirty() needs page locked */
- lock_page(page);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
- eb->len);
+ /* btree_clear_folio_dirty() needs page locked. */
+ folio_lock(folio);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
if (last)
- btree_clear_page_dirty(page);
- unlock_page(page);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
WARN_ON(atomic_read(&eb->refs) == 0);
}
@@ -3939,15 +3351,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i;
- int num_pages;
- struct page *page;
+ int num_folios;
btrfs_assert_tree_write_locked(eb);
if (trans && btrfs_header_generation(eb) != trans->transid)
return;
+ /*
+ * Instead of clearing the dirty flag off of the buffer, mark it as
+ * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve
+ * write-ordering in zoned mode, without the need to later re-dirty
+ * the extent_buffer.
+ *
+ * The actual zeroout of the buffer will happen later in
+ * btree_csum_one_bio.
+ */
+ if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
+ return;
+ }
+
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
return;
@@ -3957,32 +3381,32 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
continue;
- lock_page(page);
- btree_clear_page_dirty(page);
- unlock_page(page);
+ folio_lock(folio);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
}
WARN_ON(atomic_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int i;
- int num_pages;
+ int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
@@ -3999,34 +3423,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
* the above race.
*/
if (subpage)
- lock_page(eb->pages[0]);
- for (i = 0; i < num_pages; i++)
- btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
- eb->start, eb->len);
+ lock_page(folio_page(eb->folios[0], 0));
+ for (int i = 0; i < num_folios; i++)
+ btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
+ eb->start, eb->len);
if (subpage)
- unlock_page(eb->pages[0]);
+ unlock_page(folio_page(eb->folios[0], 0));
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (i = 0; i < num_pages; i++)
- ASSERT(PageDirty(eb->pages[i]));
+ for (int i = 0; i < num_folios; i++)
+ ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!page)
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ if (!folio)
continue;
/*
@@ -4034,46 +3456,56 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
else
- btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_clear_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
/*
* This is special handling for metadata subpage, as regular
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
else
- btrfs_subpage_set_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_set_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
-static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
+static void clear_extent_buffer_reading(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+}
+
+static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
+ /*
+ * If the extent buffer is marked UPTODATE before the read operation
+ * completes, other calls to read_extent_buffer_pages() will return
+ * early without waiting for the read to finish, causing data races.
+ */
+ WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));
+
eb->read_mirror = bbio->mirror_num;
if (uptodate &&
@@ -4087,32 +3519,30 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
+ struct folio *folio = fi.folio;
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ u32 len = fi.length;
if (uptodate)
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
bio_offset += len;
}
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
bio_put(&bbio->bio);
}
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
- int num_pages = num_extent_pages(eb), i;
struct btrfs_bio *bbio;
+ bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -4136,9 +3566,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
* will now be set, and we shouldn't read it in again.
*/
if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
return 0;
}
@@ -4149,19 +3577,26 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_READ | REQ_META, eb->fs_info,
- extent_buffer_read_end_io, eb);
+ end_bbio_meta_read, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
if (eb->fs_info->nodesize < PAGE_SIZE) {
- __bio_add_page(&bbio->bio, eb->pages[0], eb->len,
- eb->start - page_offset(eb->pages[0]));
+ ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
+ eb->start - folio_pos(eb->folios[0]));
+ ASSERT(ret);
} else {
- for (i = 0; i < num_pages; i++)
- __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
+ ASSERT(ret);
+ }
}
- btrfs_submit_bio(bbio, mirror_num);
+ btrfs_submit_bbio(bbio, mirror_num);
done:
if (wait == WAIT_COMPLETE) {
@@ -4177,7 +3612,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
unsigned long len)
{
btrfs_warn(eb->fs_info,
- "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ "access to eb bytenr %llu len %u out of range start %lu len %lu",
eb->start, eb->len, start, len);
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
@@ -4206,29 +3641,33 @@ static inline int check_eb_range(const struct extent_buffer *eb,
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
if (check_eb_range(eb, start, len)) {
/*
* Invalid range hit, reset the memory, so callers won't get
- * some random garbage for their uninitialzed memory.
+ * some random garbage for their uninitialized memory.
*/
memset(dstv, 0, len);
return;
}
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ memcpy(dstv, eb->addr + start, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
memcpy(dst, kaddr + offset, cur);
dst += cur;
@@ -4242,24 +3681,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (copy_to_user_nofault(dstv, eb->addr + start, len))
+ ret = -EFAULT;
+ return ret;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
@@ -4277,25 +3721,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = get_eb_offset_in_page(eb, start);
-
- while (len > 0) {
- page = eb->pages[i];
+ if (eb->addr)
+ return memcmp(ptrv, eb->addr + start, len);
- cur = min(len, (PAGE_SIZE - offset));
+ offset = get_eb_offset_in_folio(eb, start);
- kaddr = page_address(page);
+ while (len > 0) {
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
ret = memcmp(ptr, kaddr + offset, cur);
if (ret)
break;
@@ -4314,10 +3758,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
* For regular sector size == PAGE_SIZE case, check if @page is uptodate.
* For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
*/
-static void assert_eb_page_uptodate(const struct extent_buffer *eb,
- struct page *page)
+static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct folio *folio = eb->folios[i];
+
+ ASSERT(folio);
/*
* If we are using the commit root we could potentially clear a page
@@ -4331,11 +3777,13 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
return;
if (fs_info->nodesize < PAGE_SIZE) {
- if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page,
+ folio = eb->folios[0];
+ ASSERT(i == 0);
+ if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
eb->start, eb->len)))
- btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);
+ btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len);
} else {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!folio_test_uptodate(folio));
}
}
@@ -4343,29 +3791,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
const void *srcv, unsigned long start,
unsigned long len, bool use_memmove)
{
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
- char *src = (char *)srcv;
- unsigned long i = get_eb_page_index(start);
+ const char *src = (const char *)srcv;
+ unsigned long i = get_eb_folio_index(eb, start);
/* For unmapped (dummy) ebs, no need to check their uptodate status. */
const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
- WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
-
if (check_eb_range(eb, start, len))
return;
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (use_memmove)
+ memmove(eb->addr + start, srcv, len);
+ else
+ memcpy(eb->addr + start, srcv, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
if (check_uptodate)
- assert_eb_page_uptodate(eb, page);
+ assert_eb_folio_uptodate(eb, i);
- cur = min(len, PAGE_SIZE - offset);
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (use_memmove)
memmove(kaddr + offset, src, cur);
else
@@ -4387,16 +3840,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
static void memset_extent_buffer(const struct extent_buffer *eb, int c,
unsigned long start, unsigned long len)
{
+ const int unit_size = eb->folio_size;
unsigned long cur = start;
+ if (eb->addr) {
+ memset(eb->addr + start, c, len);
+ return;
+ }
+
while (cur < start + len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned int offset = get_eb_offset_in_page(eb, cur);
- unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset);
- struct page *page = eb->pages[index];
+ unsigned long index = get_eb_folio_index(eb, cur);
+ unsigned int offset = get_eb_offset_in_folio(eb, cur);
+ unsigned int cur_len = min(start + len - cur, unit_size - offset);
- assert_eb_page_uptodate(eb, page);
- memset(page_address(page) + offset, c, cur_len);
+ assert_eb_folio_uptodate(eb, index);
+ memset(folio_address(eb->folios[index]) + offset, c, cur_len);
cur += cur_len;
}
@@ -4413,15 +3871,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src)
{
+ const int unit_size = src->folio_size;
unsigned long cur = 0;
ASSERT(dst->len == src->len);
while (cur < src->len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned long offset = get_eb_offset_in_page(src, cur);
- unsigned long cur_len = min(src->len, PAGE_SIZE - offset);
- void *addr = page_address(src->pages[index]) + offset;
+ unsigned long index = get_eb_folio_index(src, cur);
+ unsigned long offset = get_eb_offset_in_folio(src, cur);
+ unsigned long cur_len = min(src->len, unit_size - offset);
+ void *addr = folio_address(src->folios[index]) + offset;
write_extent_buffer(dst, addr, cur, cur_len);
@@ -4434,12 +3893,12 @@ void copy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = dst->folio_size;
u64 dst_len = dst->len;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
- unsigned long i = get_eb_page_index(dst_offset);
+ unsigned long i = get_eb_folio_index(dst, dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -4447,15 +3906,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = get_eb_offset_in_page(dst, dst_offset);
+ offset = get_eb_offset_in_folio(dst, dst_offset);
while (len > 0) {
- page = dst->pages[i];
- assert_eb_page_uptodate(dst, page);
+ assert_eb_folio_uptodate(dst, i);
- cur = min(len, (unsigned long)(PAGE_SIZE - offset));
+ cur = min(len, (unsigned long)(unit_size - offset));
- kaddr = page_address(page);
+ kaddr = folio_address(dst->folios[i]);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
src_offset += cur;
@@ -4466,22 +3924,22 @@ void copy_extent_buffer(const struct extent_buffer *dst,
}
/*
- * eb_bitmap_offset() - calculate the page and offset of the byte containing the
- * given bit number
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains the
- * given bit number
- * @page_offset: return offset into the page given by page_index
+ * Calculate the folio and offset of the byte containing the given bit number.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @folio_index: return index of the folio in the extent buffer that contains
+ * the given bit number
+ * @folio_offset: return offset into the folio given by folio_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
- unsigned long *page_index,
- size_t *page_offset)
+ unsigned long *folio_index,
+ size_t *folio_offset)
{
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -4491,10 +3949,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + offset_in_page(eb->start) + byte_offset;
+ offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset;
- *page_index = offset >> PAGE_SHIFT;
- *page_offset = offset_in_page(offset);
+ *folio_index = offset >> eb->folio_shift;
+ *folio_offset = offset_in_eb_folio(eb, offset);
}
/*
@@ -4507,25 +3965,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
- u8 *kaddr;
- struct page *page;
unsigned long i;
size_t offset;
+ u8 *kaddr;
eb_bitmap_offset(eb, start, nr, &i, &offset);
- page = eb->pages[i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
+ assert_eb_folio_uptodate(eb, i);
+ kaddr = folio_address(eb->folios[i]);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
{
- unsigned long index = get_eb_page_index(bytenr);
+ unsigned long index = get_eb_folio_index(eb, bytenr);
if (check_eb_range(eb, bytenr, 1))
return NULL;
- return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr);
+ return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr);
}
/*
@@ -4610,19 +4066,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = dst->folio_size;
unsigned long cur_off = 0;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
+ if (dst->addr) {
+ const bool use_memmove = areas_overlap(src_offset, dst_offset, len);
+
+ if (use_memmove)
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ else
+ memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (cur_off < len) {
unsigned long cur_src = cur_off + src_offset;
- unsigned long pg_index = get_eb_page_index(cur_src);
- unsigned long pg_off = get_eb_offset_in_page(dst, cur_src);
+ unsigned long folio_index = get_eb_folio_index(dst, cur_src);
+ unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src);
unsigned long cur_len = min(src_offset + len - cur_src,
- PAGE_SIZE - pg_off);
- void *src_addr = page_address(dst->pages[pg_index]) + pg_off;
+ unit_size - folio_off);
+ void *src_addr = folio_address(dst->folios[folio_index]) + folio_off;
const bool use_memmove = areas_overlap(src_offset + cur_off,
dst_offset + cur_off, cur_len);
@@ -4648,24 +4115,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
+ if (dst->addr) {
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (len > 0) {
unsigned long src_i;
size_t cur;
- size_t dst_off_in_page;
- size_t src_off_in_page;
+ size_t dst_off_in_folio;
+ size_t src_off_in_folio;
void *src_addr;
bool use_memmove;
- src_i = get_eb_page_index(src_end);
+ src_i = get_eb_folio_index(dst, src_end);
- dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
- src_off_in_page = get_eb_offset_in_page(dst, src_end);
+ dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end);
+ src_off_in_folio = get_eb_offset_in_folio(dst, src_end);
- cur = min_t(unsigned long, len, src_off_in_page + 1);
- cur = min(cur, dst_off_in_page + 1);
+ cur = min_t(unsigned long, len, src_off_in_folio + 1);
+ cur = min(cur, dst_off_in_folio + 1);
- src_addr = page_address(dst->pages[src_i]) + src_off_in_page -
- cur + 1;
+ src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
+ cur + 1;
use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
cur);
@@ -4680,17 +4152,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
- struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+ const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
{
struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
- u64 page_start = page_offset(page);
- u64 cur = page_start;
+ u64 folio_start = folio_pos(folio);
+ u64 cur = folio_start;
- ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
+ ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
lockdep_assert_held(&fs_info->buffer_lock);
- while (cur < page_start + PAGE_SIZE) {
+ while (cur < folio_start + PAGE_SIZE) {
int ret;
int i;
@@ -4702,7 +4174,7 @@ static struct extent_buffer *get_next_extent_buffer(
goto out;
for (i = 0; i < ret; i++) {
/* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
+ if (gang[i]->start >= folio_start + PAGE_SIZE)
goto out;
/* Found one */
if (gang[i]->start >= bytenr) {
@@ -4716,18 +4188,18 @@ out:
return found;
}
-static int try_release_subpage_extent_buffer(struct page *page)
+static int try_release_subpage_extent_buffer(struct folio *folio)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
- u64 cur = page_offset(page);
- const u64 end = page_offset(page) + PAGE_SIZE;
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ u64 cur = folio_pos(folio);
+ const u64 end = cur + PAGE_SIZE;
int ret;
while (cur < end) {
struct extent_buffer *eb = NULL;
/*
- * Unlike try_release_extent_buffer() which uses page->private
+ * Unlike try_release_extent_buffer() which uses folio private
* to grab buffer, for subpage case we rely on radix tree, thus
* we need to ensure radix tree consistency.
*
@@ -4735,7 +4207,7 @@ static int try_release_subpage_extent_buffer(struct page *page)
* with spinlock rather than RCU.
*/
spin_lock(&fs_info->buffer_lock);
- eb = get_next_extent_buffer(fs_info, page, cur);
+ eb = get_next_extent_buffer(fs_info, folio, cur);
if (!eb) {
/* No more eb in the page range after or at cur */
spin_unlock(&fs_info->buffer_lock);
@@ -4767,43 +4239,43 @@ static int try_release_subpage_extent_buffer(struct page *page)
/*
* Here we don't care about the return value, we will always
- * check the page private at the end. And
+ * check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
}
/*
- * Finally to check if we have cleared page private, as if we have
- * released all ebs in the page, the page private should be cleared now.
+ * Finally to check if we have cleared folio private, as if we have
+ * released all ebs in the page, the folio private should be cleared now.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page))
+ spin_lock(&folio->mapping->i_private_lock);
+ if (!folio_test_private(folio))
ret = 1;
else
ret = 0;
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return ret;
}
-int try_release_extent_buffer(struct page *page)
+int try_release_extent_buffer(struct folio *folio)
{
struct extent_buffer *eb;
- if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return try_release_subpage_extent_buffer(page);
+ if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ return try_release_subpage_extent_buffer(folio);
/*
- * We need to make sure nobody is changing page->private, as we rely on
- * page->private as the pointer to extent buffer.
+ * We need to make sure nobody is changing folio private, as we rely on
+ * folio private as the pointer to extent buffer.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&folio->mapping->i_private_lock);
return 1;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
/*
@@ -4814,10 +4286,10 @@ int try_release_extent_buffer(struct page *page)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return 0;
}
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a real ref,
@@ -4832,7 +4304,8 @@ int try_release_extent_buffer(struct page *page)
}
/*
- * btrfs_readahead_tree_block - attempt to readahead a child block
+ * Attempt to readahead a child block.
+ *
* @fs_info: the fs_info
* @bytenr: bytenr to read
* @owner_root: objectid of the root that owns this eb
@@ -4871,7 +4344,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_readahead_node_child - readahead a node's child block
+ * Readahead a node's child block.
+ *
* @node: parent node we're reading from
* @slot: slot in the parent node for the child we want to read
*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 68368ba99321..039a73731135 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,11 +7,33 @@
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/slab.h>
#include "compression.h"
+#include "messages.h"
#include "ulist.h"
#include "misc.h"
+struct page;
+struct file;
+struct folio;
+struct inode;
+struct fiemap_extent_info;
+struct readahead_control;
+struct address_space;
+struct writeback_control;
+struct extent_io_tree;
+struct extent_map_tree;
+struct extent_state;
+struct btrfs_block_group;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_root;
struct btrfs_trans_handle;
+struct btrfs_tree_parent_check;
enum {
EXTENT_BUFFER_UPTODATE,
@@ -28,7 +50,8 @@ enum {
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
- EXTENT_BUFFER_NO_CHECK,
+ /* Indicate the extent buffer is written zeroed out (for zoned) */
+ EXTENT_BUFFER_ZONED_ZEROOUT,
/* Indicate that extent buffer pages a being read */
EXTENT_BUFFER_READING,
};
@@ -43,10 +66,10 @@ enum {
};
/*
- * page->private values. Every page that is controlled by the extent
- * map has page->private set to one.
+ * Folio private values. Every page that is controlled by the extent map has
+ * folio private set to this value.
*/
-#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_FOLIO_PRIVATE 1
/*
* The extent buffer bitmap operations are done with byte granularity instead of
@@ -56,17 +79,12 @@ enum {
* single word in a bitmap may straddle two pages in the extent buffer.
*/
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
-#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-struct btrfs_root;
-struct btrfs_inode;
-struct btrfs_fs_info;
-struct extent_io_tree;
-struct btrfs_tree_parent_check;
int __init extent_buffer_init_cachep(void);
void __cold extent_buffer_free_cachep(void);
@@ -74,22 +92,36 @@ void __cold extent_buffer_free_cachep(void);
#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
- unsigned long len;
+ u32 len;
+ u32 folio_size;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * The address where the eb can be accessed without any cross-page handling.
+ * This can be NULL if not possible.
+ */
+ void *addr;
+
spinlock_t refs_lock;
atomic_t refs;
int read_mirror;
- struct rcu_head rcu_head;
- pid_t lock_owner;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
+ u8 folio_shift;
+ struct rcu_head rcu_head;
struct rw_semaphore lock;
- struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ /*
+ * Pointers to all the folios of the extent buffer.
+ *
+ * For now the folio is always order 0 (aka, a single page).
+ */
+ struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
+ pid_t lock_owner;
#endif
};
@@ -100,6 +132,13 @@ struct btrfs_eb_write_context {
struct btrfs_block_group *zoned_bg;
};
+static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb,
+ u64 start)
+{
+ ASSERT(eb->folio_size);
+ return start & (eb->folio_size - 1);
+}
+
/*
* Get the correct offset inside the page of extent buffer.
*
@@ -108,29 +147,43 @@ struct btrfs_eb_write_context {
*
* Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
*/
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
- unsigned long offset)
+static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
- * to PAGE_SIZE, thus adding it won't cause any difference.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb
+ * The eb->start is aligned to folio size, thus adding it
+ * won't cause any difference.
+ * 1.2) Several page sized folios
+ * The eb->start is aligned to folio (page) size, thus
+ * adding it won't cause any difference.
*
- * For sectorsize < PAGE_SIZE, we must only read the data that belongs
- * to the eb, thus we have to take the eb->start into consideration.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * In this case there would only be one page sized folio, and there
+ * may be several different extent buffers in the page/folio.
+ * We need to add eb->start to properly access the offset inside
+ * that eb.
*/
- return offset_in_page(offset + eb->start);
+ return offset_in_folio(eb->folios[0], offset + eb->start);
}
-static inline unsigned long get_eb_page_index(unsigned long offset)
+static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb.
+ * the folio_shift would be large enough to always make us
+ * return 0 as index.
+ * 1.2) Several page sized folios
+ * The folio_shift would be PAGE_SHIFT, giving us the correct
+ * index.
*
- * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
- * and have ensured that all tree blocks are contained in one page,
- * thus we always get index == 0.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * The folio would only be page sized, and always give us 0 as index.
*/
- return offset >> PAGE_SHIFT;
+ return offset >> eb->folio_shift;
}
/*
@@ -162,6 +215,11 @@ static inline struct extent_changeset *extent_changeset_alloc(void)
return ret;
}
+static inline void extent_changeset_prealloc(struct extent_changeset *changeset, gfp_t gfp_mask)
+{
+ ulist_prealloc(&changeset->range_changed, gfp_mask);
+}
+
static inline void extent_changeset_release(struct extent_changeset *changeset)
{
if (!changeset)
@@ -178,24 +236,20 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
-struct extent_map_tree;
-
-int try_release_extent_mapping(struct page *page, gfp_t mask);
-int try_release_extent_buffer(struct page *page);
+bool try_release_extent_mapping(struct folio *folio, gfp_t mask);
+int try_release_extent_buffer(struct folio *folio);
int btrfs_read_folio(struct file *file, struct folio *folio);
-void extent_write_locked_range(struct inode *inode, struct page *locked_page,
+void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty);
-int extent_writepages(struct address_space *mapping,
- struct writeback_control *wbc);
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-void extent_readahead(struct readahead_control *rac);
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len);
+void btrfs_readahead(struct readahead_control *rac);
+int set_folio_extent_mapped(struct folio *folio);
int set_page_extent_mapped(struct page *page);
-void clear_page_extent_mapped(struct page *page);
+void clear_folio_extent_mapped(struct folio *folio);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
@@ -212,7 +266,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- struct btrfs_tree_parent_check *parent_check);
+ const struct btrfs_tree_parent_check *parent_check);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
@@ -230,6 +284,22 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
return (eb->len >> PAGE_SHIFT) ?: 1;
}
+/*
+ * This can only be determined at runtime by checking eb::folios[0].
+ *
+ * As we can have either one large folio covering the whole eb
+ * (either nodesize <= PAGE_SIZE, or high order folio), or multiple
+ * single-paged folios.
+ */
+static inline int num_extent_folios(const struct extent_buffer *eb)
+{
+ if (!eb->folios[0])
+ return 0;
+ if (folio_order(eb->folios[0]))
+ return 1;
+ return num_extent_pages(eb);
+}
+
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -285,20 +355,22 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
void set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- struct page *locked_page,
+ const struct folio *locked_folio,
+ struct extent_state **cached,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
struct folio *folio, size_t offset);
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ bool nofail);
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
- struct page *locked_page, u64 *start,
+ struct folio *locked_folio, u64 *start,
u64 *end);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8c017c4105f2..36af9aa9aab1 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -5,10 +5,10 @@
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
-#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"
+#include "disk-io.h"
static struct kmem_cache *extent_map_cache;
@@ -16,8 +16,7 @@ static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
- sizeof(struct extent_map), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_map), 0, 0, NULL);
if (!extent_map_cache)
return -ENOMEM;
return 0;
@@ -34,7 +33,7 @@ void __cold extent_map_exit(void)
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
- tree->map = RB_ROOT_CACHED;
+ tree->root = RB_ROOT;
INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -50,7 +49,6 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->compress_type = BTRFS_COMPRESS_NONE;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
@@ -67,8 +65,6 @@ void free_extent_map(struct extent_map *em)
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
@@ -81,27 +77,32 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
+static void dec_evictable_extent_maps(struct btrfs_inode *inode)
{
- struct rb_node **p = &root->rb_root.rb_node;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
+ percpu_counter_dec(&fs_info->evictable_extent_maps);
+}
+
+static int tree_insert(struct rb_root *root, struct extent_map *em)
+{
+ struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct extent_map *entry = NULL;
struct rb_node *orig_parent = NULL;
u64 end = range_end(em->start, em->len);
- bool leftmost = true;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- if (em->start < entry->start) {
+ if (em->start < entry->start)
p = &(*p)->rb_left;
- } else if (em->start >= extent_map_end(entry)) {
+ else if (em->start >= extent_map_end(entry))
p = &(*p)->rb_right;
- leftmost = false;
- } else {
+ else
return -EEXIST;
- }
}
orig_parent = parent;
@@ -124,7 +125,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
- rb_insert_color_cached(&em->rb_node, root, leftmost);
+ rb_insert_color(&em->rb_node, root);
return 0;
}
@@ -182,54 +183,163 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
-/* Check to see if two extent_map structs are adjacent and safe to merge. */
-static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+static inline u64 extent_map_block_len(const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
- return 0;
+ if (extent_map_is_compressed(em))
+ return em->disk_num_bytes;
+ return em->len;
+}
- /*
- * don't merge compressed extents, we need to know their
- * actual size
- */
- if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
- return 0;
+static inline u64 extent_map_block_end(const struct extent_map *em)
+{
+ const u64 block_start = extent_map_block_start(em);
+ const u64 block_end = block_start + extent_map_block_len(em);
- if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
- test_bit(EXTENT_FLAG_LOGGING, &next->flags))
- return 0;
+ if (block_end < block_start)
+ return (u64)-1;
+
+ return block_end;
+}
+
+static bool can_merge_extent_map(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_PINNED)
+ return false;
+
+ /* Don't merge compressed extents, we need to know their actual size. */
+ if (extent_map_is_compressed(em))
+ return false;
+
+ if (em->flags & EXTENT_FLAG_LOGGING)
+ return false;
/*
* We don't want to merge stuff that hasn't been written to the log yet
* since it may not reflect exactly what is on disk, and that would be
* bad.
*/
- if (!list_empty(&prev->list) || !list_empty(&next->list))
- return 0;
+ if (!list_empty(&em->list))
+ return false;
+
+ return true;
+}
+
+/* Check to see if two extent_map structs are adjacent and safe to merge. */
+static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
+{
+ if (extent_map_end(prev) != next->start)
+ return false;
+
+ /*
+ * The merged flag is not an on-disk flag, it just indicates we had the
+ * extent maps of 2 (or more) adjacent extents merged, so factor it out.
+ */
+ if ((prev->flags & ~EXTENT_FLAG_MERGED) !=
+ (next->flags & ~EXTENT_FLAG_MERGED))
+ return false;
+
+ if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
+ return extent_map_block_start(next) == extent_map_block_end(prev);
+
+ /* HOLES and INLINE extents. */
+ return next->disk_bytenr == prev->disk_bytenr;
+}
+
+/*
+ * Handle the on-disk data extents merge for @prev and @next.
+ *
+ * @prev: left extent to merge
+ * @next: right extent to merge
+ * @merged: the extent we will not discard after the merge; updated with new values
+ *
+ * After this, one of the two extents is the new merged extent and the other is
+ * removed from the tree and likely freed. Note that @merged is one of @prev/@next
+ * so there is const/non-const aliasing occurring here.
+ *
+ * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
+ * For now only uncompressed regular extent can be merged.
+ */
+static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next,
+ struct extent_map *merged)
+{
+ u64 new_disk_bytenr;
+ u64 new_disk_num_bytes;
+ u64 new_offset;
+
+ /* @prev and @next should not be compressed. */
+ ASSERT(!extent_map_is_compressed(prev));
+ ASSERT(!extent_map_is_compressed(next));
+
+ /*
+ * There are two different cases where @prev and @next can be merged.
+ *
+ * 1) They are referring to the same data extent:
+ *
+ * |<----- data extent A ----->|
+ * |<- prev ->|<- next ->|
+ *
+ * 2) They are referring to different data extents but still adjacent:
+ *
+ * |<-- data extent A -->|<-- data extent B -->|
+ * |<- prev ->|<- next ->|
+ *
+ * The calculation here always merges the data extents first, then updates
+ * @offset using the new data extents.
+ *
+ * For case 1), the merged data extent would be the same.
+ * For case 2), we just merge the two data extents into one.
+ */
+ new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr);
+ new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes,
+ next->disk_bytenr + next->disk_num_bytes) -
+ new_disk_bytenr;
+ new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;
+
+ merged->disk_bytenr = new_disk_bytenr;
+ merged->disk_num_bytes = new_disk_num_bytes;
+ merged->ram_bytes = new_disk_num_bytes;
+ merged->offset = new_offset;
+}
- ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
- prev->block_start != EXTENT_MAP_DELALLOC);
-
- if (prev->map_lookup || next->map_lookup)
- ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
- test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
-
- if (extent_map_end(prev) == next->start &&
- prev->flags == next->flags &&
- prev->map_lookup == next->map_lookup &&
- ((next->block_start == EXTENT_MAP_HOLE &&
- prev->block_start == EXTENT_MAP_HOLE) ||
- (next->block_start == EXTENT_MAP_INLINE &&
- prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
- next->block_start == extent_map_block_end(prev)))) {
- return 1;
+static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
+ struct extent_map *em)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ return;
+ btrfs_crit(fs_info,
+"%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x",
+ prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes,
+ em->ram_bytes, em->offset, em->flags);
+ ASSERT(0);
+}
+
+/* Internal sanity checks for btrfs debug builds. */
+static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ return;
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ if (em->disk_num_bytes == 0)
+ dump_extent_map(fs_info, "zero disk_num_bytes", em);
+ if (em->offset + em->len > em->ram_bytes)
+ dump_extent_map(fs_info, "ram_bytes too small", em);
+ if (em->offset + em->len > em->disk_num_bytes &&
+ !extent_map_is_compressed(em))
+ dump_extent_map(fs_info, "disk_num_bytes too small", em);
+ if (!extent_map_is_compressed(em) &&
+ em->ram_bytes != em->disk_num_bytes)
+ dump_extent_map(fs_info,
+ "ram_bytes mismatch with disk_num_bytes for non-compressed em",
+ em);
+ } else if (em->offset) {
+ dump_extent_map(fs_info, "non-zero offset for hole/inline", em);
}
- return 0;
}
-static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
+static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -244,46 +354,51 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (refcount_read(&em->refs) > 2)
return;
+ if (!can_merge_extent_map(em))
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(merge, em)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
- em->orig_start = merge->orig_start;
em->len += merge->len;
- em->block_len += merge->block_len;
- em->block_start = merge->block_start;
- em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
- em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
- rb_erase_cached(&merge->rb_node, &tree->map);
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ merge_ondisk_extents(merge, em, em);
+ em->flags |= EXTENT_FLAG_MERGED;
+
+ validate_extent_map(fs_info, em);
+ rb_erase(&merge->rb_node, &tree->root);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
+ dec_evictable_extent_maps(inode);
}
}
rb = rb_next(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(em, merge)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
- em->block_len += merge->block_len;
- rb_erase_cached(&merge->rb_node, &tree->map);
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ merge_ondisk_extents(em, merge, em);
+ validate_extent_map(fs_info, em);
+ rb_erase(&merge->rb_node, &tree->root);
RB_CLEAR_NODE(&merge->rb_node);
- em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
free_extent_map(merge);
+ dec_evictable_extent_maps(inode);
}
}
/*
* Unpin an extent from the cache.
*
- * @tree: tree to unpin the extent in
+ * @inode: the inode from which we are unpinning an extent range
* @start: logical offset in the file
* @len: length of the extent
* @gen: generation that this extent has been modified in
@@ -291,133 +406,108 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
* Called after an extent has been written to disk properly. Set the generation
* to the generation that actually added the file item to the inode so we know
* we need to sync this extent when we call fsync().
+ *
+ * Returns: 0 on success
+ * -ENOENT when the extent is not found in the tree
+ * -EUCLEAN if the found extent does not match the expected start
*/
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
- u64 gen)
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
struct extent_map *em;
- bool prealloc = false;
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
- WARN_ON(!em || em->start != start);
-
- if (!em)
+ if (WARN_ON(!em)) {
+ btrfs_warn(fs_info,
+"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ start, start + len, gen);
+ ret = -ENOENT;
goto out;
+ }
- em->generation = gen;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- em->mod_start = em->start;
- em->mod_len = em->len;
-
- if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
- prealloc = true;
- clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ if (WARN_ON(em->start != start)) {
+ btrfs_warn(fs_info,
+"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ em->start, start, start + len, gen);
+ ret = -EUCLEAN;
+ goto out;
}
- try_merge_map(tree, em);
+ em->generation = gen;
+ em->flags &= ~EXTENT_FLAG_PINNED;
- if (prealloc) {
- em->mod_start = em->start;
- em->mod_len = em->len;
- }
+ try_merge_map(inode, em);
- free_extent_map(em);
out:
write_unlock(&tree->lock);
+ free_extent_map(em);
return ret;
}
-void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
- lockdep_assert_held_write(&tree->lock);
+ lockdep_assert_held_write(&inode->extent_tree.lock);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags &= ~EXTENT_FLAG_LOGGING;
if (extent_map_in_tree(em))
- try_merge_map(tree, em);
+ try_merge_map(inode, em);
}
-static inline void setup_extent_mapping(struct extent_map_tree *tree,
+static inline void setup_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em,
int modified)
{
refcount_inc(&em->refs);
- em->mod_start = em->start;
- em->mod_len = em->len;
+
+ ASSERT(list_empty(&em->list));
if (modified)
- list_move(&em->list, &tree->modified_extents);
+ list_add(&em->list, &inode->extent_tree.modified_extents);
else
- try_merge_map(tree, em);
-}
-
-static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- set_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT, NULL);
- }
-}
-
-static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- __clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT,
- NULL, NULL);
- }
+ try_merge_map(inode, em);
}
/*
- * Add new extent map to the extent tree
+ * Add a new extent map to an inode's extent map tree.
*
- * @tree: tree to insert new map in
+ * @inode: the target inode
* @em: map to insert
* @modified: indicate whether the given @em should be added to the
* modified list, which indicates the extent needs to be logged
*
- * Insert @em into @tree or perform a simple forward/backward merge with
- * existing mappings. The extent_map struct passed in will be inserted
- * into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was successful.
+ * Insert @em into the @inode's extent map tree or perform a simple
+ * forward/backward merge with existing mappings. The extent_map struct passed
+ * in will be inserted into the tree directly, with an additional reference
+ * taken, or a reference dropped if the merge attempt was successful.
*/
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified)
+static int add_extent_mapping(struct btrfs_inode *inode,
+ struct extent_map *em, int modified)
{
- int ret = 0;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
lockdep_assert_held_write(&tree->lock);
- ret = tree_insert(&tree->map, em);
+ validate_extent_map(fs_info, em);
+ ret = tree_insert(&tree->root, em);
if (ret)
- goto out;
+ return ret;
- setup_extent_mapping(tree, em, modified);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
- extent_map_device_set_bits(em, CHUNK_ALLOCATED);
- extent_map_device_clear_bits(em, CHUNK_TRIMMED);
- }
-out:
- return ret;
+ setup_extent_mapping(inode, em, modified);
+
+ if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
+ percpu_counter_inc(&fs_info->evictable_extent_maps);
+
+ return 0;
}
static struct extent_map *
@@ -429,7 +519,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
+ rb_node = __tree_search(&tree->root, start, &prev_or_next);
if (!rb_node) {
if (prev_or_next)
rb_node = prev_or_next;
@@ -483,42 +573,49 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
}
/*
- * Remove an extent_map from the extent tree.
+ * Remove an extent_map from its inode's extent tree.
*
- * @tree: extent tree to remove from
+ * @inode: the inode the extent map belongs to
* @em: extent map being removed
*
- * Remove @em from @tree. No reference counts are dropped, and no checks
- * are done to see if the range is in use.
+ * Remove @em from the extent tree of @inode. No reference counts are dropped,
+ * and no checks are done to see if the range is in use.
*/
-void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- rb_erase_cached(&em->rb_node, &tree->map);
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ WARN_ON(em->flags & EXTENT_FLAG_PINNED);
+ rb_erase(&em->rb_node, &tree->root);
+ if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
RB_CLEAR_NODE(&em->rb_node);
+
+ dec_evictable_extent_maps(inode);
}
-static void replace_extent_mapping(struct extent_map_tree *tree,
+static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *cur,
struct extent_map *new,
int modified)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *tree = &inode->extent_tree;
+
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ validate_extent_map(fs_info, new);
+
+ WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
ASSERT(extent_map_in_tree(cur));
- if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
- rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root);
RB_CLEAR_NODE(&cur->rb_node);
- setup_extent_mapping(tree, new, modified);
+ setup_extent_mapping(inode, new, modified);
}
static struct extent_map *next_extent_map(const struct extent_map *em)
@@ -547,7 +644,7 @@ static struct extent_map *prev_extent_map(struct extent_map *em)
* and an extent that you want to insert, deal with overlap and insert
* the best fitted new extent into the tree.
*/
-static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
+static noinline int merge_extent_mapping(struct btrfs_inode *inode,
struct extent_map *existing,
struct extent_map *em,
u64 map_start)
@@ -558,7 +655,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
u64 end;
u64 start_diff;
- BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+ if (map_start < em->start || map_start >= extent_map_end(em))
+ return -EINVAL;
if (existing->start > map_start) {
next = existing;
@@ -575,19 +673,15 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
start_diff = start - em->start;
em->start = start;
em->len = end - start;
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- em->block_start += start_diff;
- em->block_len = em->len;
- }
- return add_extent_mapping(em_tree, em, 0);
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ em->offset += start_diff;
+ return add_extent_mapping(inode, em, 0);
}
/*
- * Add extent mapping into em_tree.
+ * Add extent mapping into an inode's extent map tree.
*
- * @fs_info: the filesystem
- * @em_tree: extent tree into which we want to insert the extent mapping
+ * @inode: target inode
* @em_in: extent we are inserting
* @start: start of the logical range btrfs_get_extent() is requesting
* @len: length of the logical range btrfs_get_extent() is requesting
@@ -595,8 +689,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Note that @em_in's range may be different from [start, start+len),
* but they must be overlapped.
*
- * Insert @em_in into @em_tree. In case there is an overlapping range, handle
- * the -EEXIST by either:
+ * Insert @em_in into the inode's extent map tree. In case there is an
+ * overlapping range, handle the -EEXIST by either:
* a) Returning the existing extent in @em_in if @start is within the
* existing em.
* b) Merge the existing extent with @em_in passed in.
@@ -604,21 +698,21 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Return 0 on success, otherwise -EEXIST.
*
*/
-int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len)
{
int ret;
struct extent_map *em = *em_in;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
/*
* Tree-checker should have rejected any inline extent with non-zero
* file offset. Here just do a sanity check.
*/
- if (em->block_start == EXTENT_MAP_INLINE)
+ if (em->disk_bytenr == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = add_extent_mapping(inode, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -626,9 +720,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (ret == -EEXIST) {
struct extent_map *existing;
- ret = 0;
-
- existing = search_extent_mapping(em_tree, start, len);
+ existing = search_extent_mapping(&inode->extent_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -649,15 +741,14 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
* The existing extent map is the one nearest to
* the [start, start + len) range which overlaps
*/
- ret = merge_extent_mapping(em_tree, existing,
- em, start);
- if (ret) {
+ ret = merge_extent_mapping(inode, existing, em, start);
+ if (WARN_ON(ret)) {
free_extent_map(em);
*em_in = NULL;
- WARN_ONCE(ret,
-"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
- ret, existing->start, existing->len,
- orig_start, orig_len);
+ btrfs_warn(fs_info,
+"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
+ existing->start, extent_map_end(existing),
+ orig_start, orig_start + orig_len, start);
}
free_extent_map(existing);
}
@@ -672,20 +763,26 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
* if needed. This avoids searching the tree, from the root down to the first
* extent map, before each deletion.
*/
-static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+ struct rb_node *node;
+
write_lock(&tree->lock);
- while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+ node = rb_first(&tree->root);
+ while (node) {
struct extent_map *em;
- struct rb_node *node;
+ struct rb_node *next = rb_next(node);
- node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- remove_extent_mapping(tree, em);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
+ remove_extent_mapping(inode, em);
free_extent_map(em);
- cond_resched_rwlock_write(&tree->lock);
+
+ if (cond_resched_rwlock_write(&tree->lock))
+ node = rb_first(&tree->root);
+ else
+ node = next;
}
write_unlock(&tree->lock);
}
@@ -716,7 +813,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
WARN_ON(end < start);
if (end == (u64)-1) {
if (start == 0 && !skip_pinned) {
- drop_all_extent_maps_fast(em_tree);
+ drop_all_extent_maps_fast(inode);
return;
}
len = (u64)-1;
@@ -746,7 +843,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
u64 gen;
unsigned long flags;
bool modified;
- bool compressed;
if (em_end < end) {
next_em = next_extent_map(em);
@@ -758,19 +854,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
}
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
start = em_end;
goto next;
}
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
/*
* In case we split the extent map, we want to preserve the
* EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
* it on the new extent maps.
*/
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
modified = !list_empty(&em->list);
/*
@@ -781,7 +876,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
goto remove_em;
gen = em->generation;
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
if (em->start < start) {
if (!split) {
@@ -793,29 +887,21 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->start = em->start;
split->len = start - em->start;
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
-
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
- split->orig_block_len = max(split->block_len,
- em->orig_block_len);
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ split->disk_bytenr = em->disk_bytenr;
+ split->disk_num_bytes = em->disk_num_bytes;
+ split->offset = em->offset;
split->ram_bytes = em->ram_bytes;
} else {
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
+ split->disk_bytenr = em->disk_bytenr;
+ split->disk_num_bytes = 0;
+ split->offset = 0;
split->ram_bytes = split->len;
}
split->generation = gen;
split->flags = flags;
- split->compress_type = em->compress_type;
- replace_extent_mapping(em_tree, em, split, modified);
+ replace_extent_mapping(inode, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -829,41 +915,26 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
split->start = end;
split->len = em_end - end;
- split->block_start = em->block_start;
+ split->disk_bytenr = em->disk_bytenr;
split->flags = flags;
- split->compress_type = em->compress_type;
split->generation = gen;
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_block_len = max(em->block_len,
- em->orig_block_len);
-
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ split->disk_num_bytes = em->disk_num_bytes;
+ split->offset = em->offset + end - em->start;
split->ram_bytes = em->ram_bytes;
- if (compressed) {
- split->block_len = em->block_len;
- split->orig_start = em->orig_start;
- } else {
- const u64 diff = end - em->start;
-
- split->block_len = split->len;
- split->block_start += diff;
- split->orig_start = em->orig_start;
- }
} else {
+ split->disk_num_bytes = 0;
+ split->offset = 0;
split->ram_bytes = split->len;
- split->orig_start = split->start;
- split->block_len = 0;
- split->orig_block_len = 0;
}
if (extent_map_in_tree(em)) {
- replace_extent_mapping(em_tree, em, split,
- modified);
+ replace_extent_mapping(inode, em, split, modified);
} else {
int ret;
- ret = add_extent_mapping(em_tree, split,
- modified);
+ ret = add_extent_mapping(inode, split, modified);
/* Logic error, shouldn't happen. */
ASSERT(ret == 0);
if (WARN_ON(ret != 0) && modified)
@@ -898,7 +969,7 @@ remove_em:
ASSERT(!split);
btrfs_set_inode_full_sync(inode);
}
- remove_extent_mapping(em_tree, em);
+ remove_extent_mapping(inode, em);
}
/*
@@ -953,7 +1024,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
do {
btrfs_drop_extent_map_range(inode, new_em->start, end, false);
write_lock(&tree->lock);
- ret = add_extent_mapping(tree, new_em, modified);
+ ret = add_extent_mapping(inode, new_em, modified);
write_unlock(&tree->lock);
} while (ret == -EEXIST);
@@ -997,28 +1068,26 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
}
ASSERT(em->len == len);
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
- ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
- ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
+ ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE);
+ ASSERT(em->flags & EXTENT_FLAG_PINNED);
+ ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
ASSERT(!list_empty(&em->list));
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags &= ~EXTENT_FLAG_PINNED;
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
split_pre->len = pre;
- split_pre->orig_start = split_pre->start;
- split_pre->block_start = new_logical;
- split_pre->block_len = split_pre->len;
- split_pre->orig_block_len = split_pre->block_len;
+ split_pre->disk_bytenr = new_logical;
+ split_pre->disk_num_bytes = split_pre->len;
+ split_pre->offset = 0;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
- split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
- replace_extent_mapping(em_tree, em, split_pre, 1);
+ replace_extent_mapping(inode, em, split_pre, 1);
/*
* Now we only have an extent_map at:
@@ -1028,15 +1097,13 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
/* Insert the middle extent_map. */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre;
- split_mid->orig_start = split_mid->start;
- split_mid->block_start = em->block_start + pre;
- split_mid->block_len = split_mid->len;
- split_mid->orig_block_len = split_mid->block_len;
+ split_mid->disk_bytenr = extent_map_block_start(em) + pre;
+ split_mid->disk_num_bytes = split_mid->len;
+ split_mid->offset = 0;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
- split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, 1);
+ add_extent_mapping(inode, split_mid, 1);
/* Once for us */
free_extent_map(em);
@@ -1051,3 +1118,303 @@ out_free_pre:
free_extent_map(split_pre);
return ret;
}
+
+struct btrfs_em_shrink_ctx {
+ long nr_to_scan;
+ long scanned;
+ u64 last_ino;
+ u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
+ struct extent_map_tree *tree = &inode->extent_tree;
+ long nr_dropped = 0;
+ struct rb_node *node;
+
+ lockdep_assert_held_write(&tree->lock);
+
+ /*
+ * Take the mmap lock so that we serialize with the inode logging phase
+ * of fsync because we may need to set the full sync flag on the inode,
+ * in case we have to remove extent maps in the tree's list of modified
+ * extents. If we set the full sync flag in the inode while an fsync is
+ * in progress, we may risk missing new extents because before the flag
+ * is set, fsync decides to only wait for writeback to complete and then
+ * during inode logging it sees the flag set and uses the subvolume tree
+ * to find new extents, which may not be there yet because ordered
+ * extents haven't completed yet.
+ *
+ * We also do a try lock because we don't want to block for too long and
+ * we are holding the extent map tree's lock in write mode.
+ */
+ if (!down_read_trylock(&inode->i_mmap_lock))
+ return 0;
+
+ node = rb_first(&tree->root);
+ while (node) {
+ struct rb_node *next = rb_next(node);
+ struct extent_map *em;
+
+ em = rb_entry(node, struct extent_map, rb_node);
+ ctx->scanned++;
+
+ if (em->flags & EXTENT_FLAG_PINNED)
+ goto next;
+
+ /*
+ * If the inode is in the list of modified extents (new) and its
+ * generation is the same (or is greater than) the current fs
+ * generation, it means it was not yet persisted so we have to
+ * set the full sync flag so that the next fsync will not miss
+ * it.
+ */
+ if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
+ btrfs_set_inode_full_sync(inode);
+
+ remove_extent_mapping(inode, em);
+ trace_btrfs_extent_map_shrinker_remove_em(inode, em);
+ /* Drop the reference for the tree. */
+ free_extent_map(em);
+ nr_dropped++;
+next:
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * Stop if we need to reschedule or there's contention on the
+ * lock. This is to avoid slowing other tasks trying to take the
+ * lock.
+ */
+ if (need_resched() || rwlock_needbreak(&tree->lock) ||
+ btrfs_fs_closing(fs_info))
+ break;
+ node = next;
+ }
+ up_read(&inode->i_mmap_lock);
+
+ return nr_dropped;
+}
+
+static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
+ u64 min_ino)
+{
+ struct btrfs_inode *inode;
+ unsigned long from = min_ino;
+
+ xa_lock(&root->inodes);
+ while (true) {
+ struct extent_map_tree *tree;
+
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+
+ tree = &inode->extent_tree;
+
+ /*
+ * We want to be fast so if the lock is busy we don't want to
+ * spend time waiting for it (some task is about to do IO for
+ * the inode).
+ */
+ if (!write_trylock(&tree->lock))
+ goto next;
+
+ /*
+ * Skip inode if it doesn't have loaded extent maps, so we avoid
+ * getting a reference and doing an iput later. This includes
+ * cases like files that were opened for things like stat(2), or
+ * files with all extent maps previously released through the
+ * release folio callback (btrfs_release_folio()) or released in
+ * a previous run, or directories which never have extent maps.
+ */
+ if (RB_EMPTY_ROOT(&tree->root)) {
+ write_unlock(&tree->lock);
+ goto next;
+ }
+
+ if (igrab(&inode->vfs_inode))
+ break;
+
+ write_unlock(&tree->lock);
+next:
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
+ }
+ xa_unlock(&root->inodes);
+
+ return inode;
+}
+
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_inode *inode;
+ long nr_dropped = 0;
+ u64 min_ino = ctx->last_ino + 1;
+
+ inode = find_first_inode_to_shrink(root, min_ino);
+ while (inode) {
+ nr_dropped += btrfs_scan_inode(inode, ctx);
+ write_unlock(&inode->extent_tree.lock);
+
+ min_ino = btrfs_ino(inode) + 1;
+ ctx->last_ino = btrfs_ino(inode);
+ iput(&inode->vfs_inode);
+
+ if (ctx->scanned >= ctx->nr_to_scan ||
+ btrfs_fs_closing(fs_info))
+ break;
+
+ cond_resched();
+
+ inode = find_first_inode_to_shrink(root, min_ino);
+ }
+
+ if (inode) {
+ /*
+ * There are still inodes in this root or we happened to process
+ * the last one and reached the scan limit. In either case set
+ * the current root to this one, so we'll resume from the next
+ * inode if there is one or we will find out this was the last
+ * one and move to the next root.
+ */
+ ctx->last_root = btrfs_root_id(root);
+ } else {
+ /*
+ * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
+ * that when processing the next root we start from its first inode.
+ */
+ ctx->last_ino = 0;
+ ctx->last_root = btrfs_root_id(root) + 1;
+ }
+
+ return nr_dropped;
+}
+
+static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_em_shrink_ctx ctx;
+ u64 start_root_id;
+ u64 next_root_id;
+ bool cycled = false;
+ long nr_dropped = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
+
+ ctx.scanned = 0;
+ ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
+
+ /*
+ * In case we have multiple tasks running this shrinker, make the next
+ * one start from the next inode in case it starts before we finish.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+ fs_info->extent_map_shrinker_last_ino++;
+ ctx.last_root = fs_info->extent_map_shrinker_last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ start_root_id = ctx.last_root;
+ next_root_id = ctx.last_root;
+
+ if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
+ nr, ctx.last_root,
+ ctx.last_ino);
+ }
+
+ while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
+ struct btrfs_root *root;
+ unsigned long count;
+
+ cond_resched();
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)&root,
+ (unsigned long)next_root_id, 1);
+ if (count == 0) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ if (start_root_id > 0 && !cycled) {
+ next_root_id = 0;
+ ctx.last_root = 0;
+ ctx.last_ino = 0;
+ cycled = true;
+ continue;
+ }
+ break;
+ }
+ next_root_id = btrfs_root_id(root) + 1;
+ root = btrfs_grab_root(root);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (!root)
+ continue;
+
+ if (is_fstree(btrfs_root_id(root)))
+ nr_dropped += btrfs_scan_root(root, &ctx);
+
+ btrfs_put_root(root);
+ }
+
+ /*
+ * In case of multiple tasks running this extent map shrinking code this
+ * isn't perfect but it's simple and silences things like KCSAN. It's
+ * not possible to know which task made more progress because we can
+ * cycle back to the first root and first inode if it's not the first
+ * time the shrinker ran, see the above logic. Also a task that started
+ * later may finish ealier than another task and made less progress. So
+ * make this simple and update to the progress of the last task that
+ * finished, with the occasional possiblity of having two consecutive
+ * runs of the shrinker process the same inodes.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+ fs_info->extent_map_shrinker_last_root = ctx.last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+ nr, ctx.last_root,
+ ctx.last_ino);
+ }
+
+ atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+}
+
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ /*
+ * Do nothing if the shrinker is already running. In case of high memory
+ * pressure we can have a lot of tasks calling us and all passing the
+ * same nr_to_scan value, but in reality we may need only to free
+ * nr_to_scan extent maps (or less). In case we need to free more than
+ * that, we will be called again by the fs shrinker, so no worries about
+ * not doing enough work to reclaim memory from extent maps.
+ * We can also be repeatedly called with the same nr_to_scan value
+ * simply because the shrinker runs asynchronously and multiple calls
+ * to this function are made before the shrinker does enough progress.
+ *
+ * That's why we set the atomic counter to nr_to_scan only if its
+ * current value is zero, instead of incrementing the counter by
+ * nr_to_scan.
+ */
+ if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
+ return;
+
+ queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
+}
+
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
+{
+ atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+ INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 35d27c756e08..cd123b266b64 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,45 +3,91 @@
#ifndef BTRFS_EXTENT_MAP_H
#define BTRFS_EXTENT_MAP_H
+#include <linux/compiler_types.h>
+#include <linux/spinlock_types.h>
#include <linux/rbtree.h>
+#include <linux/list.h>
#include <linux/refcount.h>
+#include "misc.h"
+#include "compression.h"
+
+struct btrfs_inode;
+struct btrfs_fs_info;
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
-/* used only during fiemap calls */
-#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */
enum {
/* this entry not yet on disk, don't free it */
- EXTENT_FLAG_PINNED,
- EXTENT_FLAG_COMPRESSED,
+ ENUM_BIT(EXTENT_FLAG_PINNED),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
/* pre-allocated extent */
- EXTENT_FLAG_PREALLOC,
+ ENUM_BIT(EXTENT_FLAG_PREALLOC),
/* Logging this extent */
- EXTENT_FLAG_LOGGING,
- /* Filling in a preallocated extent */
- EXTENT_FLAG_FILLING,
- /* filesystem extent mapping type */
- EXTENT_FLAG_FS_MAPPING,
+ ENUM_BIT(EXTENT_FLAG_LOGGING),
/* This em is merged from two or more physically adjacent ems */
- EXTENT_FLAG_MERGED,
+ ENUM_BIT(EXTENT_FLAG_MERGED),
};
+/*
+ * This structure represents file extents and holes.
+ *
+ * Unlike on-disk file extent items, extent maps can be merged to save memory.
+ * This means members only match file extent items before any merging.
+ *
+ * Keep this structure as compact as possible, as we can have really large
+ * amounts of allocated extent maps at any time.
+ */
struct extent_map {
struct rb_node rb_node;
- /* all of these are in bytes */
+ /* All of these are in bytes. */
+
+ /* File offset matching the offset of a BTRFS_EXTENT_ITEM_KEY key. */
u64 start;
+
+ /*
+ * Length of the file extent.
+ *
+ * For non-inlined file extents it's btrfs_file_extent_item::num_bytes.
+ * For inline extents it's sectorsize, since inline data starts at
+ * offsetof(struct btrfs_file_extent_item, disk_bytenr) thus
+ * btrfs_file_extent_item::num_bytes is not valid.
+ */
u64 len;
- u64 mod_start;
- u64 mod_len;
- u64 orig_start;
- u64 orig_block_len;
+
+ /*
+ * The bytenr of the full on-disk extent.
+ *
+ * For regular extents it's btrfs_file_extent_item::disk_bytenr.
+ * For holes it's EXTENT_MAP_HOLE and for inline extents it's
+ * EXTENT_MAP_INLINE.
+ */
+ u64 disk_bytenr;
+
+ /*
+ * The full on-disk extent length, matching
+ * btrfs_file_extent_item::disk_num_bytes.
+ */
+ u64 disk_num_bytes;
+
+ /*
+ * Offset inside the decompressed extent.
+ *
+ * For regular extents it's btrfs_file_extent_item::offset.
+ * For holes and inline extents it's 0.
+ */
+ u64 offset;
+
+ /*
+ * The decompressed size of the whole on-disk extent, matching
+ * btrfs_file_extent_item::ram_bytes.
+ */
u64 ram_bytes;
- u64 block_start;
- u64 block_len;
/*
* Generation of the extent map, for merged em it's the highest
@@ -49,47 +95,81 @@ struct extent_map {
* For non-merged extents, it's from btrfs_file_extent_item::generation.
*/
u64 generation;
- unsigned long flags;
- /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
- struct map_lookup *map_lookup;
+ u32 flags;
refcount_t refs;
- unsigned int compress_type;
struct list_head list;
};
struct extent_map_tree {
- struct rb_root_cached map;
+ struct rb_root root;
struct list_head modified_extents;
rwlock_t lock;
};
struct btrfs_inode;
+static inline void extent_map_set_compression(struct extent_map *em,
+ enum btrfs_compression_type type)
+{
+ if (type == BTRFS_COMPRESS_ZLIB)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
+ else if (type == BTRFS_COMPRESS_LZO)
+ em->flags |= EXTENT_FLAG_COMPRESS_LZO;
+ else if (type == BTRFS_COMPRESS_ZSTD)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
+}
+
+static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
+ return BTRFS_COMPRESS_ZLIB;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
+ return BTRFS_COMPRESS_LZO;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
+ return BTRFS_COMPRESS_ZSTD;
+
+ return BTRFS_COMPRESS_NONE;
+}
+
+/*
+ * More efficient way to determine if extent is compressed, instead of using
+ * 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
+ */
+static inline bool extent_map_is_compressed(const struct extent_map *em)
+{
+ return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
+ EXTENT_FLAG_COMPRESS_LZO |
+ EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
+}
+
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
}
-static inline u64 extent_map_end(struct extent_map *em)
+static inline u64 extent_map_block_start(const struct extent_map *em)
{
- if (em->start + em->len < em->start)
- return (u64)-1;
- return em->start + em->len;
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ if (extent_map_is_compressed(em))
+ return em->disk_bytenr;
+ return em->disk_bytenr + em->offset;
+ }
+ return em->disk_bytenr;
}
-static inline u64 extent_map_block_end(struct extent_map *em)
+static inline u64 extent_map_end(const struct extent_map *em)
{
- if (em->block_start + em->block_len < em->block_start)
+ if (em->start + em->len < em->start)
return (u64)-1;
- return em->block_start + em->block_len;
+ return em->start + em->len;
}
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified);
-void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
u64 new_logical);
@@ -97,12 +177,11 @@ struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void __cold extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
-void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
u64 start, u64 end,
@@ -110,5 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
new file mode 100644
index 000000000000..df7f09f3b02e
--- /dev/null
+++ b/fs/btrfs/fiemap.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "backref.h"
+#include "btrfs_inode.h"
+#include "fiemap.h"
+#include "file.h"
+#include "file-item.h"
+
+struct btrfs_fiemap_entry {
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+};
+
+/*
+ * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file
+ * range from the inode's io tree, unlock the subvolume tree search path, flush
+ * the fiemap cache and relock the file range and research the subvolume tree.
+ * The value here is something negative that can't be confused with a valid
+ * errno value and different from 1 because that's also a return value from
+ * fiemap_fill_next_extent() and also it's often used to mean some btree search
+ * did not find a key, so make it some distinct negative value.
+ */
+#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
+
+/*
+ * Used to:
+ *
+ * - Cache the next entry to be emitted to the fiemap buffer, so that we can
+ * merge extents that are contiguous and can be grouped as a single one;
+ *
+ * - Store extents ready to be written to the fiemap buffer in an intermediary
+ * buffer. This intermediary buffer is to ensure that in case the fiemap
+ * buffer is memory mapped to the fiemap target file, we don't deadlock
+ * during btrfs_page_mkwrite(). This is because during fiemap we are locking
+ * an extent range in order to prevent races with delalloc flushing and
+ * ordered extent completion, which is needed in order to reliably detect
+ * delalloc in holes and prealloc extents. And this can lead to a deadlock
+ * if the fiemap buffer is memory mapped to the file we are running fiemap
+ * against (a silly, useless in practice scenario, but possible) because
+ * btrfs_page_mkwrite() will try to lock the same extent range.
+ */
+struct fiemap_cache {
+ /* An array of ready fiemap entries. */
+ struct btrfs_fiemap_entry *entries;
+ /* Number of entries in the entries array. */
+ int entries_size;
+ /* Index of the next entry in the entries array to write to. */
+ int entries_pos;
+ /*
+ * Once the entries array is full, this indicates what's the offset for
+ * the next file extent item we must search for in the inode's subvolume
+ * tree after unlocking the extent range in the inode's io tree and
+ * releasing the search path.
+ */
+ u64 next_search_offset;
+ /*
+ * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
+ * to count ourselves emitted extents and stop instead of relying on
+ * fiemap_fill_next_extent() because we buffer ready fiemap entries at
+ * the @entries array, and we want to stop as soon as we hit the max
+ * amount of extents to map, not just to save time but also to make the
+ * logic at extent_fiemap() simpler.
+ */
+ unsigned int extents_mapped;
+ /* Fields for the cached extent (unsubmitted, not ready, extent). */
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+ bool cached;
+};
+
+static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ for (int i = 0; i < cache->entries_pos; i++) {
+ struct btrfs_fiemap_entry *entry = &cache->entries[i];
+ int ret;
+
+ ret = fiemap_fill_next_extent(fieinfo, entry->offset,
+ entry->phys, entry->len,
+ entry->flags);
+ /*
+ * Ignore 1 (reached max entries) because we keep track of that
+ * ourselves in emit_fiemap_extent().
+ */
+ if (ret < 0)
+ return ret;
+ }
+ cache->entries_pos = 0;
+
+ return 0;
+}
+
+/*
+ * Helper to submit fiemap extent.
+ *
+ * Will try to merge current fiemap extent specified by @offset, @phys,
+ * @len and @flags with cached one.
+ * And only when we fails to merge, cached one will be submitted as
+ * fiemap extent.
+ *
+ * Return value is the same as fiemap_fill_next_extent().
+ */
+static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ u64 offset, u64 phys, u64 len, u32 flags)
+{
+ struct btrfs_fiemap_entry *entry;
+ u64 cache_end;
+
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
+ if (!cache->cached)
+ goto assign;
+
+ /*
+ * When iterating the extents of the inode, at extent_fiemap(), we may
+ * find an extent that starts at an offset behind the end offset of the
+ * previous extent we processed. This happens if fiemap is called
+ * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+ * after we had to unlock the file range, release the search path, emit
+ * the fiemap extents stored in the buffer (cache->entries array) and
+ * the lock the remainder of the range and re-search the btree.
+ *
+ * For example we are in leaf X processing its last item, which is the
+ * file extent item for file range [512K, 1M[, and after
+ * btrfs_next_leaf() releases the path, there's an ordered extent that
+ * completes for the file range [768K, 2M[, and that results in trimming
+ * the file extent item so that it now corresponds to the file range
+ * [512K, 768K[ and a new file extent item is inserted for the file
+ * range [768K, 2M[, which may end up as the last item of leaf X or as
+ * the first item of the next leaf - in either case btrfs_next_leaf()
+ * will leave us with a path pointing to the new extent item, for the
+ * file range [768K, 2M[, since that's the first key that follows the
+ * last one we processed. So in order not to report overlapping extents
+ * to user space, we trim the length of the previously cached extent and
+ * emit it.
+ *
+ * Upon calling btrfs_next_leaf() we may also find an extent with an
+ * offset smaller than or equals to cache->offset, and this happens
+ * when we had a hole or prealloc extent with several delalloc ranges in
+ * it, but after btrfs_next_leaf() released the path, delalloc was
+ * flushed and the resulting ordered extents were completed, so we can
+ * now have found a file extent item for an offset that is smaller than
+ * or equals to what we have in cache->offset. We deal with this as
+ * described below.
+ */
+ cache_end = cache->offset + cache->len;
+ if (cache_end > offset) {
+ if (offset == cache->offset) {
+ /*
+ * We cached a dealloc range (found in the io tree) for
+ * a hole or prealloc extent and we have now found a
+ * file extent item for the same offset. What we have
+ * now is more recent and up to date, so discard what
+ * we had in the cache and use what we have just found.
+ */
+ goto assign;
+ } else if (offset > cache->offset) {
+ /*
+ * The extent range we previously found ends after the
+ * offset of the file extent item we found and that
+ * offset falls somewhere in the middle of that previous
+ * extent range. So adjust the range we previously found
+ * to end at the offset of the file extent item we have
+ * just found, since this extent is more up to date.
+ * Emit that adjusted range and cache the file extent
+ * item we have just found. This corresponds to the case
+ * where a previously found file extent item was split
+ * due to an ordered extent completing.
+ */
+ cache->len = offset - cache->offset;
+ goto emit;
+ } else {
+ const u64 range_end = offset + len;
+
+ /*
+ * The offset of the file extent item we have just found
+ * is behind the cached offset. This means we were
+ * processing a hole or prealloc extent for which we
+ * have found delalloc ranges (in the io tree), so what
+ * we have in the cache is the last delalloc range we
+ * found while the file extent item we found can be
+ * either for a whole delalloc range we previously
+ * emmitted or only a part of that range.
+ *
+ * We have two cases here:
+ *
+ * 1) The file extent item's range ends at or behind the
+ * cached extent's end. In this case just ignore the
+ * current file extent item because we don't want to
+ * overlap with previous ranges that may have been
+ * emmitted already;
+ *
+ * 2) The file extent item starts behind the currently
+ * cached extent but its end offset goes beyond the
+ * end offset of the cached extent. We don't want to
+ * overlap with a previous range that may have been
+ * emmitted already, so we emit the currently cached
+ * extent and then partially store the current file
+ * extent item's range in the cache, for the subrange
+ * going the cached extent's end to the end of the
+ * file extent item.
+ */
+ if (range_end <= cache_end)
+ return 0;
+
+ if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+ phys += cache_end - offset;
+
+ offset = cache_end;
+ len = range_end - cache_end;
+ goto emit;
+ }
+ }
+
+ /*
+ * Only merges fiemap extents if
+ * 1) Their logical addresses are continuous
+ *
+ * 2) Their physical addresses are continuous
+ * So truly compressed (physical size smaller than logical size)
+ * extents won't get merged with each other
+ *
+ * 3) Share same flags
+ */
+ if (cache->offset + cache->len == offset &&
+ cache->phys + cache->len == phys &&
+ cache->flags == flags) {
+ cache->len += len;
+ return 0;
+ }
+
+emit:
+ /* Not mergeable, need to submit cached one */
+
+ if (cache->entries_pos == cache->entries_size) {
+ /*
+ * We will need to research for the end offset of the last
+ * stored extent and not from the current offset, because after
+ * unlocking the range and releasing the path, if there's a hole
+ * between that end offset and this current offset, a new extent
+ * may have been inserted due to a new write, so we don't want
+ * to miss it.
+ */
+ entry = &cache->entries[cache->entries_size - 1];
+ cache->next_search_offset = entry->offset + entry->len;
+ cache->cached = false;
+
+ return BTRFS_FIEMAP_FLUSH_CACHE;
+ }
+
+ entry = &cache->entries[cache->entries_pos];
+ entry->offset = cache->offset;
+ entry->phys = cache->phys;
+ entry->len = cache->len;
+ entry->flags = cache->flags;
+ cache->entries_pos++;
+ cache->extents_mapped++;
+
+ if (cache->extents_mapped == fieinfo->fi_extents_max) {
+ cache->cached = false;
+ return 1;
+ }
+assign:
+ cache->cached = true;
+ cache->offset = offset;
+ cache->phys = phys;
+ cache->len = len;
+ cache->flags = flags;
+
+ return 0;
+}
+
+/*
+ * Emit last fiemap cache
+ *
+ * The last fiemap cache may still be cached in the following case:
+ * 0 4k 8k
+ * |<- Fiemap range ->|
+ * |<------------ First extent ----------->|
+ *
+ * In this case, the first extent range will be cached but not emitted.
+ * So we must emit it before ending extent_fiemap().
+ */
+static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ int ret;
+
+ if (!cache->cached)
+ return 0;
+
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
+{
+ struct extent_buffer *clone = path->nodes[0];
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
+
+ /*
+ * Add a temporary extra ref to an already cloned extent buffer to
+ * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
+ * the cost of allocating a new one.
+ */
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
+ atomic_inc(&clone->refs);
+
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ goto out;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Important to preserve the start field, for the optimizations when
+ * checking if extents are shared (see extent_fiemap()).
+ *
+ * We must set ->start before calling copy_extent_buffer_full(). If we
+ * are on sub-pagesize blocksize, we use ->start to determine the offset
+ * into the folio where our eb exists, and if we update ->start after
+ * the fact then any subsequent reads of the eb may read from a
+ * different offset in the folio than where we originally copied into.
+ */
+ clone->start = path->nodes[0]->start;
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ copy_extent_buffer_full(clone, path->nodes[0]);
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+out:
+ if (ret)
+ free_extent_buffer(clone);
+
+ return ret;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+ }
+
+ /*
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone is trying
+ * to push items into this leaf when trying to insert an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we check if extents are shared, as backref walking may need to
+ * lock the same leaf we are processing.
+ */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct extent_state **delalloc_cached_state,
+ struct btrfs_backref_share_check_ctx *backref_ctx,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
+
+ /*
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
+ */
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ delalloc_cached_state,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ /*
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
+ */
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
+ }
+
+ /*
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
+ */
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, except if the btree is empty, which is impossible because
+ * at least it has the inode item for this inode and all the items for
+ * the root inode 256.
+ */
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+
+ /*
+ * For an inline extent, the disk_bytenr is where inline data starts at,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
+ */
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+ }
+
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
+
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+static int extent_fiemap(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct extent_state *delalloc_cached_state = NULL;
+ struct btrfs_path *path;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_share_check_ctx *backref_ctx;
+ u64 last_extent_end = 0;
+ u64 prev_extent_end;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
+ bool stopped = false;
+ int ret;
+
+ cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
+ cache.entries = kmalloc_array(cache.entries_size,
+ sizeof(struct btrfs_fiemap_entry),
+ GFP_KERNEL);
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
+ path = btrfs_alloc_path();
+ if (!cache.entries || !backref_ctx || !path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+restart:
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
+
+ lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, range_start);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /*
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
+ */
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
+ while (prev_extent_end < range_end) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
+ */
+ if (extent_end <= range_start)
+ goto next_item;
+
+ backref_ctx->curr_leaf_bytenr = leaf->start;
+
+ /* We have in implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 hole_end = min(key.offset, range_end) - 1;
+
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
+ prev_extent_end, hole_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= range_end) {
+ stopped = true;
+ break;
+ }
+ }
+
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
+ }
+
+ if (compression != BTRFS_COMPRESS_NONE)
+ flags |= FIEMAP_EXTENT_ENCODED;
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx,
+ disk_bytenr, extent_offset,
+ extent_gen, key.offset,
+ extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
+ }
+
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* emit_fiemap_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ prev_extent_end = extent_end;
+next_item:
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out_unlock;
+ }
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
+ }
+ cond_resched();
+ }
+
+check_eof_delalloc:
+ if (!stopped && prev_extent_end < range_end) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state, backref_ctx,
+ 0, 0, 0, prev_extent_end, range_end - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = range_end;
+ }
+
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_cached_state,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+out_unlock:
+ unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+
+ if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
+ btrfs_release_path(path);
+ ret = flush_fiemap_cache(fieinfo, &cache);
+ if (ret)
+ goto out;
+ len -= cache.next_search_offset - start;
+ start = cache.next_search_offset;
+ goto restart;
+ } else if (ret < 0) {
+ goto out;
+ }
+
+ /*
+ * Must free the path before emitting to the fiemap buffer because we
+ * may have a non-cloned leaf and if the fiemap buffer is memory mapped
+ * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
+ * waiting for an ordered extent that in order to complete needs to
+ * modify that leaf, therefore leading to a deadlock.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ ret = flush_fiemap_cache(fieinfo, &cache);
+ if (ret)
+ goto out;
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+out:
+ free_extent_state(delalloc_cached_state);
+ kfree(cache.entries);
+ btrfs_free_backref_share_ctx(backref_ctx);
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ int ret;
+
+ ret = fiemap_prep(inode, fieinfo, start, &len, 0);
+ if (ret)
+ return ret;
+
+ /*
+ * fiemap_prep() called filemap_write_and_wait() for the whole possible
+ * file range (0 to LLONG_MAX), but that is not enough if we have
+ * compression enabled. The first filemap_fdatawrite_range() only kicks
+ * in the compression of data (in an async thread) and will return
+ * before the compression is done and writeback is started. A second
+ * filemap_fdatawrite_range() is needed to wait for the compression to
+ * complete and writeback to start. We also need to wait for ordered
+ * extents to complete, because our fiemap implementation uses mainly
+ * file extent items to list the extents, searching for extent maps
+ * only for file ranges with holes or prealloc extents to figure out
+ * if we have delalloc in those ranges.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
+
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
+}
diff --git a/fs/btrfs/fiemap.h b/fs/btrfs/fiemap.h
new file mode 100644
index 000000000000..cfd74b35988f
--- /dev/null
+++ b/fs/btrfs/fiemap.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FIEMAP_H
+#define BTRFS_FIEMAP_H
+
+#include <linux/fiemap.h>
+
+int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+
+#endif /* BTRFS_FIEMAP_H */
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 45cae356e89b..886749b39672 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -10,17 +10,14 @@
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "messages.h"
-#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "bio.h"
-#include "print-tree.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
-#include "super.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
@@ -48,18 +45,17 @@
*/
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start, end, i_size;
int ret;
spin_lock(&inode->lock);
i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ if (!inode->file_extent_tree) {
inode->disk_i_size = i_size;
goto out_unlock;
}
- ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
&end, EXTENT_DIRTY);
if (!ret && start == 0)
i_size = min(i_size, end + 1);
@@ -87,14 +83,15 @@ out_unlock:
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
+ if (!inode->file_extent_tree)
+ return 0;
+
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
- if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
- return 0;
- return set_extent_bit(&inode->file_extent_tree, start, start + len - 1,
+ return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY, NULL);
}
@@ -115,15 +112,16 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
+ if (!inode->file_extent_tree)
+ return 0;
+
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
len == (u64)-1);
- if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
- return 0;
- return clear_extent_bit(&inode->file_extent_tree, start,
+ return clear_extent_bit(inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, NULL);
}
@@ -153,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info)
* Calculate the total size needed to allocate for an ordered sum structure
* spanning @bytes in the file.
*/
-static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
+static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes)
{
return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes);
}
@@ -179,7 +177,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
sizeof(*item));
if (ret < 0)
goto out;
- BUG_ON(ret); /* Can't happen */
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -356,7 +353,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
u32 bio_offset = 0;
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
+ test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state))
return BLK_STS_OK;
/*
@@ -434,8 +431,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
memset(csum_dst, 0, csum_size);
count = 1;
- if (inode->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset = bbio->file_offset + bio_offset;
set_extent_bit(&inode->io_tree, file_offset,
@@ -454,9 +450,22 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
return ret;
}
+/*
+ * Search for checksums for a given logical range.
+ *
+ * @root: The root where to look for checksums.
+ * @start: Logical address of target checksum range.
+ * @end: End offset (inclusive) of the target checksum range.
+ * @list: List for adding each checksum that was found.
+ * Can be NULL in case the caller only wants to check if
+ * there any checksums for the range.
+ * @nowait: Indicate if the search must be non-blocking or not.
+ *
+ * Return < 0 on error, 0 if no checksums were found, or 1 if checksums were
+ * found.
+ */
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit,
- bool nowait)
+ struct list_head *list, bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -464,8 +473,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
struct btrfs_csum_item *item;
- LIST_HEAD(tmplist);
int ret;
+ bool found_csums = false;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@@ -475,11 +484,6 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
return -ENOMEM;
path->nowait = nowait;
- if (search_commit) {
- path->skip_locking = 1;
- path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- }
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start;
@@ -487,7 +491,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto fail;
+ goto out;
if (ret > 0 && path->slots[0] > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
@@ -522,7 +526,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto fail;
+ goto out;
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -544,6 +548,10 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
continue;
}
+ found_csums = true;
+ if (!list)
+ goto out;
+
csum_end = min(csum_end, end + 1);
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
@@ -557,7 +565,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
GFP_NOFS);
if (!sums) {
ret = -ENOMEM;
- goto fail;
+ goto out;
}
sums->logical = start;
@@ -571,21 +579,24 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
bytes_to_csum_size(fs_info, size));
start += size;
- list_add_tail(&sums->list, &tmplist);
+ list_add_tail(&sums->list, list);
}
path->slots[0]++;
}
- ret = 0;
-fail:
- while (ret < 0 && !list_empty(&tmplist)) {
- sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
- list_del(&sums->list);
- kfree(sums);
+out:
+ btrfs_free_path(path);
+ if (ret < 0) {
+ if (list) {
+ struct btrfs_ordered_sum *tmp_sums;
+
+ list_for_each_entry_safe(sums, tmp_sums, list, list)
+ kfree(sums);
+ }
+
+ return ret;
}
- list_splice_tail(&tmplist, list);
- btrfs_free_path(path);
- return ret;
+ return found_csums ? 1 : 0;
}
/*
@@ -874,8 +885,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
- ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
path = btrfs_alloc_path();
if (!path)
@@ -1175,7 +1186,7 @@ extend_csum:
* search, etc, because log trees are temporary anyway and it
* would only save a few bytes of leaf space.
*/
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
if (path->slots[0] + 1 >=
btrfs_header_nritems(path->nodes[0])) {
ret = find_next_csum_offset(root, path, &next_offset);
@@ -1229,8 +1240,6 @@ insert:
ins_size);
if (ret < 0)
goto out;
- if (WARN_ON(ret != 0))
- goto out;
leaf = path->nodes[0];
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -1263,7 +1272,7 @@ out:
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
- struct btrfs_file_extent_item *fi,
+ const struct btrfs_file_extent_item *fi,
struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1271,58 +1280,56 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
struct extent_buffer *leaf = path->nodes[0];
const int slot = path->slots[0];
struct btrfs_key key;
- u64 extent_start, extent_end;
- u64 bytenr;
+ u64 extent_start;
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
- extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
+ const u64 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
em->start = extent_start;
- em->len = extent_end - extent_start;
- em->orig_start = extent_start -
- btrfs_file_extent_offset(leaf, fi);
- em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (bytenr == 0) {
- em->block_start = EXTENT_MAP_HOLE;
+ em->len = btrfs_file_extent_end(path) - extent_start;
+ if (disk_bytenr == 0) {
+ em->disk_bytenr = EXTENT_MAP_HOLE;
+ em->disk_num_bytes = 0;
+ em->offset = 0;
return;
}
+ em->disk_bytenr = disk_bytenr;
+ em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ em->offset = btrfs_file_extent_offset(leaf, fi);
if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- em->block_start = bytenr;
- em->block_len = em->orig_block_len;
+ extent_map_set_compression(em, compress_type);
} else {
- bytenr += btrfs_file_extent_offset(leaf, fi);
- em->block_start = bytenr;
- em->block_len = em->len;
+ /*
+ * Older kernels can create regular non-hole data
+ * extents with ram_bytes smaller than disk_num_bytes.
+ * Not a big deal, just always use disk_num_bytes
+ * for ram_bytes.
+ */
+ em->ram_bytes = em->disk_num_bytes;
if (type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
- em->block_start = EXTENT_MAP_INLINE;
- em->start = extent_start;
- em->len = extent_end - extent_start;
- /*
- * Initialize orig_start and block_len with the same values
- * as in inode.c:btrfs_get_extent().
- */
- em->orig_start = EXTENT_MAP_HOLE;
- em->block_len = (u64)-1;
- em->compress_type = compress_type;
- if (compress_type != BTRFS_COMPRESS_NONE)
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ /* Tree-checker has ensured this. */
+ ASSERT(extent_start == 0);
+
+ em->disk_bytenr = EXTENT_MAP_INLINE;
+ em->start = 0;
+ em->len = fs_info->sectorsize;
+ em->offset = 0;
+ extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
"root %llu", type, btrfs_ino(inode), extent_start,
- root->root_key.objectid);
+ btrfs_root_id(root));
}
}
@@ -1343,12 +1350,10 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path)
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
- end = btrfs_file_extent_ram_bytes(leaf, fi);
- end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
- } else {
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE)
+ end = leaf->fs_info->sectorsize;
+ else
end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- }
return end;
}
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 04bd2d34efb1..0e13661a71f3 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -3,8 +3,21 @@
#ifndef BTRFS_FILE_ITEM_H
#define BTRFS_FILE_ITEM_H
+#include <linux/list.h>
+#include <uapi/linux/btrfs_tree.h>
#include "accessors.h"
+struct extent_map;
+struct btrfs_file_extent_item;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_bio;
+struct btrfs_trans_handle;
+struct btrfs_root;
+struct btrfs_ordered_sum;
+struct btrfs_path;
+struct btrfs_inode;
+
#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
(offsetof(struct btrfs_file_extent_item, disk_bytenr))
@@ -55,14 +68,13 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit,
- bool nowait);
+ struct list_head *list, bool nowait);
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
u64 start, u64 end, u8 *csum_buf,
unsigned long *csum_bitmap);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
- struct btrfs_file_extent_item *fi,
+ const struct btrfs_file_extent_item *fi,
struct extent_map *em);
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fc6c91773bc8..0e63603ac5c7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -18,13 +18,12 @@
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "ctree.h"
+#include "direct-io.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
-#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
@@ -110,8 +109,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
- block_len);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
+ block_start, block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
@@ -129,7 +128,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int err = 0;
+ int ret = 0;
int i;
u64 num_bytes;
u64 start_pos;
@@ -159,17 +158,20 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
cached);
- err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+ ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
- if (err)
- return err;
+ if (ret)
+ return ret;
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
- btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
+ btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
+ start_pos, num_bytes);
}
/*
@@ -204,7 +206,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
- struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(inode);
@@ -241,10 +242,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->drop_cache)
btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
- if (args->start >= inode->disk_i_size && !args->replace_extent)
+ if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
modify_tree = 0;
- update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
+ update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -371,14 +372,17 @@ next_slot:
btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_ADD_DELAYED_REF,
- disk_bytenr, num_bytes, 0);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- new_key.objectid,
- args->start - extent_offset,
- 0, false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = disk_bytenr,
+ .num_bytes = num_bytes,
+ .parent = 0,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, new_key.objectid,
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -461,14 +465,17 @@ delete_extent_item:
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_DROP_DELAYED_REF,
- disk_bytenr, num_bytes, 0);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- key.objectid,
- key.offset - extent_offset, 0,
- false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = disk_bytenr,
+ .num_bytes = num_bytes,
+ .parent = 0,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, key.objectid,
+ key.offset - extent_offset,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -744,10 +751,13 @@ again:
extent_end - split);
btrfs_mark_buffer_dirty(trans, leaf);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
- num_bytes, 0);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset, 0, false);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_root_id(root);
+ btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -770,10 +780,14 @@ again:
other_start = end;
other_end = 0;
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, 0);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
- 0, false);
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_root_id(root);
+ btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -866,9 +880,9 @@ static int prepare_uptodate_page(struct inode *inode,
* released.
*
* The private flag check is essential for subpage as we need
- * to store extra bitmap using page->private.
+ * to store extra bitmap using folio private.
*/
- if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
+ if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
unlock_page(page);
return -EAGAIN;
}
@@ -911,7 +925,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
- int err = 0;
+ int ret = 0;
int faili;
for (i = 0; i < num_pages; i++) {
@@ -921,28 +935,28 @@ again:
if (!pages[i]) {
faili = i - 1;
if (nowait)
- err = -EAGAIN;
+ ret = -EAGAIN;
else
- err = -ENOMEM;
+ ret = -ENOMEM;
goto fail;
}
- err = set_page_extent_mapped(pages[i]);
- if (err < 0) {
+ ret = set_page_extent_mapped(pages[i]);
+ if (ret < 0) {
faili = i;
goto fail;
}
if (i == 0)
- err = prepare_uptodate_page(inode, pages[i], pos,
+ ret = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
- if (!err && i == num_pages - 1)
- err = prepare_uptodate_page(inode, pages[i],
+ if (!ret && i == num_pages - 1)
+ ret = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
- if (err) {
+ if (ret) {
put_page(pages[i]);
- if (!nowait && err == -EAGAIN) {
- err = 0;
+ if (!nowait && ret == -EAGAIN) {
+ ret = 0;
goto again;
}
faili = i - 1;
@@ -958,7 +972,7 @@ fail:
put_page(pages[faili]);
faili--;
}
- return err;
+ return ret;
}
@@ -1090,7 +1104,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
&cached_state);
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL, nowait, false);
+ NULL, nowait, false);
if (ret <= 0)
btrfs_drew_write_unlock(&root->snapshot_lock);
else
@@ -1108,33 +1122,32 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
static void update_time_for_write(struct inode *inode)
{
- struct timespec64 now, ctime;
+ struct timespec64 now, ts;
if (IS_NOCMTIME(inode))
return;
now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
- inode->i_mtime = now;
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(&ts, &now))
+ inode_set_mtime_to_ts(inode, now);
- ctime = inode_get_ctime(inode);
- if (!timespec64_equal(&ctime, &now))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(&ts, &now))
inode_set_ctime_to_ts(inode, now);
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
}
-static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
- size_t count)
+int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
loff_t pos = iocb->ki_pos;
int ret;
loff_t oldsize;
- loff_t start_pos;
/*
* Quickly bail out on NOWAIT writes if we don't have the nodatacow or
@@ -1158,9 +1171,8 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
*/
update_time_for_write(inode);
- start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
- if (start_pos > oldsize) {
+ if (pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
@@ -1172,13 +1184,12 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
return 0;
}
-static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
- struct iov_iter *i)
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
{
struct file *file = iocb->ki_filp;
loff_t pos;
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
@@ -1189,7 +1200,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
ssize_t ret;
bool only_release_metadata = false;
bool force_page_uptodate = false;
- loff_t old_isize = i_size_read(inode);
+ loff_t old_isize;
unsigned int ilock_flags = 0;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
@@ -1201,6 +1212,13 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if (ret < 0)
return ret;
+ /*
+ * We can only trust the isize with inode lock held, or it can race with
+ * other buffered writes and cause incorrect call of
+ * pagecache_isize_extended() to overwrite existing data.
+ */
+ old_isize = i_size_read(inode);
+
ret = generic_write_checks(iocb, i);
if (ret <= 0)
goto out;
@@ -1436,202 +1454,6 @@ out:
return num_written ? num_written : ret;
}
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- const u32 blocksize_mask = fs_info->sectorsize - 1;
-
- if (offset & blocksize_mask)
- return -EINVAL;
-
- if (iov_iter_alignment(iter) & blocksize_mask)
- return -EINVAL;
-
- return 0;
-}
-
-static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- loff_t pos;
- ssize_t written = 0;
- ssize_t written_buffered;
- size_t prev_left = 0;
- loff_t endbyte;
- ssize_t err;
- unsigned int ilock_flags = 0;
- struct iomap_dio *dio;
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- ilock_flags |= BTRFS_ILOCK_TRY;
-
- /*
- * If the write DIO is within EOF, use a shared lock and also only if
- * security bits will likely not be dropped by file_remove_privs() called
- * from btrfs_write_check(). Either will need to be rechecked after the
- * lock was acquired.
- */
- if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
- ilock_flags |= BTRFS_ILOCK_SHARED;
-
-relock:
- err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
- if (err < 0)
- return err;
-
- /* Shared lock cannot be used with security bits set. */
- if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- ilock_flags &= ~BTRFS_ILOCK_SHARED;
- goto relock;
- }
-
- err = generic_write_checks(iocb, from);
- if (err <= 0) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- return err;
- }
-
- err = btrfs_write_check(iocb, from, err);
- if (err < 0) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- goto out;
- }
-
- pos = iocb->ki_pos;
- /*
- * Re-check since file size may have changed just before taking the
- * lock or pos may have changed because of O_APPEND in generic_write_check()
- */
- if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
- pos + iov_iter_count(from) > i_size_read(inode)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- ilock_flags &= ~BTRFS_ILOCK_SHARED;
- goto relock;
- }
-
- if (check_direct_IO(fs_info, from, pos)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- goto buffered;
- }
-
- /*
- * The iov_iter can be mapped to the same file range we are writing to.
- * If that's the case, then we will deadlock in the iomap code, because
- * it first calls our callback btrfs_dio_iomap_begin(), which will create
- * an ordered extent, and after that it will fault in the pages that the
- * iov_iter refers to. During the fault in we end up in the readahead
- * pages code (starting at btrfs_readahead()), which will lock the range,
- * find that ordered extent and then wait for it to complete (at
- * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
- * obviously the ordered extent can never complete as we didn't submit
- * yet the respective bio(s). This always happens when the buffer is
- * memory mapped to the same file range, since the iomap DIO code always
- * invalidates pages in the target file range (after starting and waiting
- * for any writeback).
- *
- * So here we disable page faults in the iov_iter and then retry if we
- * got -EFAULT, faulting in the pages before the retry.
- */
-again:
- from->nofault = true;
- dio = btrfs_dio_write(iocb, from, written);
- from->nofault = false;
-
- if (IS_ERR_OR_NULL(dio)) {
- err = PTR_ERR_OR_ZERO(dio);
- } else {
- /*
- * If we have a synchoronous write, we must make sure the fsync
- * triggered by the iomap_dio_complete() call below doesn't
- * deadlock on the inode lock - we are already holding it and we
- * can't call it after unlocking because we may need to complete
- * partial writes due to the input buffer (or parts of it) not
- * being already faulted in.
- */
- ASSERT(current->journal_info == NULL);
- current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
- err = iomap_dio_complete(dio);
- current->journal_info = NULL;
- }
-
- /* No increment (+=) because iomap returns a cumulative value. */
- if (err > 0)
- written = err;
-
- if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
- const size_t left = iov_iter_count(from);
- /*
- * We have more data left to write. Try to fault in as many as
- * possible of the remainder pages and retry. We do this without
- * releasing and locking again the inode, to prevent races with
- * truncate.
- *
- * Also, in case the iov refers to pages in the file range of the
- * file we want to write to (due to a mmap), we could enter an
- * infinite loop if we retry after faulting the pages in, since
- * iomap will invalidate any pages in the range early on, before
- * it tries to fault in the pages of the iov. So we keep track of
- * how much was left of iov in the previous EFAULT and fallback
- * to buffered IO in case we haven't made any progress.
- */
- if (left == prev_left) {
- err = -ENOTBLK;
- } else {
- fault_in_iov_iter_readable(from, left);
- prev_left = left;
- goto again;
- }
- }
-
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-
- /*
- * If 'err' is -ENOTBLK or we have not written all data, then it means
- * we must fallback to buffered IO.
- */
- if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
- goto out;
-
-buffered:
- /*
- * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
- * it must retry the operation in a context where blocking is acceptable,
- * because even if we end up not blocking during the buffered IO attempt
- * below, we will block when flushing and waiting for the IO.
- */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- err = -EAGAIN;
- goto out;
- }
-
- pos = iocb->ki_pos;
- written_buffered = btrfs_buffered_write(iocb, from);
- if (written_buffered < 0) {
- err = written_buffered;
- goto out;
- }
- /*
- * Ensure all data is persisted. We want the next direct IO read to be
- * able to read what was just written.
- */
- endbyte = pos + written_buffered - 1;
- err = btrfs_fdatawrite_range(inode, pos, endbyte);
- if (err)
- goto out;
- err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
- if (err)
- goto out;
- written += written_buffered;
- iocb->ki_pos = pos + written_buffered;
- invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
-out:
- return err < 0 ? err : written;
-}
-
static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded)
{
@@ -1731,7 +1553,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
int ret;
struct blk_plug plug;
@@ -1751,10 +1573,10 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
- struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_inode *inode = ctx->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
list_empty(&ctx->ordered_extents))
return true;
@@ -1765,7 +1587,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
- if (inode->last_trans <= fs_info->last_trans_committed &&
+ if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
list_empty(&ctx->ordered_extents)))
return true;
@@ -1787,9 +1609,9 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file_dentry(file);
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
@@ -1800,7 +1622,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
skip_ilock = true;
current->journal_info = NULL;
- lockdep_assert_held(&inode->i_rwsem);
+ btrfs_assert_inode_locked(inode);
}
trace_btrfs_sync_file(file, datasync);
@@ -1830,9 +1652,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
if (skip_ilock)
- down_write(&BTRFS_I(inode)->i_mmap_lock);
+ down_write(&inode->i_mmap_lock);
else
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -1857,9 +1679,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = start_ordered_ops(inode, start, end);
if (ret) {
if (skip_ilock)
- up_write(&BTRFS_I(inode)->i_mmap_lock);
+ up_write(&inode->i_mmap_lock);
else
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -1871,8 +1693,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* running delalloc the full sync flag may be set if we need to drop
* extra extent map ranges due to temporary memory allocation failures.
*/
- full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* We have to do this here to avoid the priority inversion of waiting on
@@ -1891,15 +1712,29 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
+ clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
* checksum lookups in the csum tree, and use instead the
* checksums attached to the ordered extents.
*/
- btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
- &ctx.ordered_extents);
- ret = filemap_fdatawait_range(inode->i_mapping, start, end);
+ btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
+ if (ret)
+ goto out_release_extents;
+
+ /*
+ * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
+ * starting and waiting for writeback, because for buffered IO
+ * it may have been set during the end IO callback
+ * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
+ * case an error happened and we need to wait for ordered
+ * extents to complete so that any extent maps that point to
+ * unwritten locations are dropped and we don't log them.
+ */
+ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
+ ret = btrfs_wait_ordered_range(inode, start, len);
}
if (ret)
@@ -1907,15 +1742,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
- smp_mb();
if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
* reason, it's no longer relevant.
*/
- clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* An ordered extent might have started before and completed
* already with io errors, in which case the inode was not
@@ -1923,10 +1756,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* for any errors that might have happened since we last
* checked called fsync.
*/
- ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
goto out_release_extents;
}
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
/*
* We use start here because we will need to wait on the IO to complete
* in btrfs_sync_log, which could require joining a transaction (for
@@ -1946,6 +1781,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans->in_fsync = true;
ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ /*
+ * Scratch eb no longer needed, release before syncing log or commit
+ * transaction, to avoid holding unnecessary memory during such long
+ * operations.
+ */
+ if (ctx.scratch_eb) {
+ free_extent_buffer(ctx.scratch_eb);
+ ctx.scratch_eb = NULL;
+ }
btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
@@ -1963,9 +1807,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* inside btrfs_sync_log to keep things safe.
*/
if (skip_ilock)
- up_write(&BTRFS_I(inode)->i_mmap_lock);
+ up_write(&inode->i_mmap_lock);
else
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
@@ -2024,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_commit_transaction(trans);
out:
+ free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.list));
ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file);
@@ -2034,12 +1879,179 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
if (skip_ilock)
- up_write(&BTRFS_I(inode)->i_mmap_lock);
+ up_write(&inode->i_mmap_lock);
else
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ unsigned long zero_start;
+ loff_t size;
+ size_t fsize = folio_size(folio);
+ vm_fault_t ret;
+ int ret2;
+ int reserved = 0;
+ u64 reserved_space;
+ u64 page_start;
+ u64 page_end;
+ u64 end;
+
+ ASSERT(folio_order(folio) == 0);
+
+ reserved_space = fsize;
+
+ sb_start_pagefault(inode->i_sb);
+ page_start = folio_pos(folio);
+ page_end = page_start + folio_size(folio) - 1;
+ end = page_end;
+
+ /*
+ * Reserving delalloc space after obtaining the page lock can lead to
+ * deadlock. For example, if a dirty page is locked by this function
+ * and the call to btrfs_delalloc_reserve_space() ends up triggering
+ * dirty page write out, then the btrfs_writepages() function could
+ * end up waiting indefinitely to get a lock on the page currently
+ * being processed by btrfs_page_mkwrite() function.
+ */
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
+ if (!ret2) {
+ ret2 = file_update_time(vmf->vma->vm_file);
+ reserved = 1;
+ }
+ if (ret2) {
+ ret = vmf_error(ret2);
+ if (reserved)
+ goto out;
+ goto out_noreserve;
+ }
+
+ /* Make the VM retry the fault. */
+ ret = VM_FAULT_NOPAGE;
+again:
+ down_read(&BTRFS_I(inode)->i_mmap_lock);
+ folio_lock(folio);
+ size = i_size_read(inode);
+
+ if ((folio->mapping != inode->i_mapping) ||
+ (page_start >= size)) {
+ /* Page got truncated out from underneath us. */
+ goto out_unlock;
+ }
+ folio_wait_writeback(folio);
+
+ lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret2 = set_folio_extent_mapped(folio);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
+
+ /*
+ * We can't set the delalloc bits if there are pending ordered
+ * extents. Drop our locks and wait for them to finish.
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ folio_unlock(folio);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_start_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
+ reserved_space = round_up(size - page_start, fs_info->sectorsize);
+ if (reserved_space < fsize) {
+ end = page_start + reserved_space - 1;
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, end + 1,
+ fsize - reserved_space, true);
+ }
+ }
+
+ /*
+ * page_mkwrite gets called when the page is firstly dirtied after it's
+ * faulted in, but write(2) could also dirty a page and set delalloc
+ * bits, thus in this case for space account reason, we still need to
+ * clear any delalloc bits within this page range since we have to
+ * reserve data&meta space before lock_page() (see above comments).
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
+ &cached_state);
+ if (ret2) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
+
+ /* Page is wholly or partially inside EOF. */
+ if (page_start + folio_size(folio) > size)
+ zero_start = offset_in_folio(folio, size);
+ else
+ zero_start = fsize;
+
+ if (zero_start != fsize)
+ folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
+
+ btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
+ btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+ btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
+
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return VM_FAULT_LOCKED;
+
+out_unlock:
+ folio_unlock(folio);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+out:
+ btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
+ reserved_space, (ret != 0));
+out_noreserve:
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
@@ -2169,12 +2181,9 @@ out:
hole_em->start = offset;
hole_em->len = end - offset;
hole_em->ram_bytes = hole_em->len;
- hole_em->orig_start = offset;
- hole_em->block_start = EXTENT_MAP_HOLE;
- hole_em->block_len = 0;
- hole_em->orig_block_len = 0;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->disk_bytenr = EXTENT_MAP_HOLE;
+ hole_em->disk_num_bytes = 0;
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -2198,14 +2207,14 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
struct extent_map *em;
int ret = 0;
- em = btrfs_get_extent(inode, NULL, 0,
+ em = btrfs_get_extent(inode, NULL,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
return PTR_ERR(em);
/* Hole or vacuum extent(only exists in no-hole mode) */
- if (em->block_start == EXTENT_MAP_HOLE) {
+ if (em->disk_bytenr == EXTENT_MAP_HOLE) {
ret = 1;
*len = em->start + em->len > *start + *len ?
0 : *start + *len - em->start - em->len;
@@ -2227,15 +2236,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* will always return true.
* So here we need to do extra page alignment for
* filemap_range_has_page().
+ *
+ * And do not decrease page_lockend right now, as it can be 0.
*/
const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
- const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE);
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
+ /* The same page or adjacent pages. */
+ if (page_lockend <= page_lockstart)
+ break;
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2247,7 +2261,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* we do, unlock the range and retry.
*/
if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
- page_lockend))
+ page_lockend - 1))
break;
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -2270,7 +2284,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
int slot;
- struct btrfs_ref ref = { 0 };
int ret;
if (replace_len == 0)
@@ -2326,14 +2339,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->qgroup_reserved,
&key);
} else {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = extent_info->disk_offset,
+ .num_bytes = extent_info->disk_len,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
u64 ref_offset;
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- extent_info->disk_offset,
- extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(inode), ref_offset, 0, false);
+ btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -2500,9 +2516,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
inode_inc_iversion(&inode->vfs_inode);
if (!extent_info || extent_info->update_times)
- inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
+ inode_set_mtime_to_ts(&inode->vfs_inode,
+ inode_set_ctime_current(&inode->vfs_inode));
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
break;
@@ -2613,7 +2630,7 @@ out:
static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
@@ -2631,7 +2648,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
- ret = btrfs_wait_ordered_range(inode, offset, len);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
if (ret)
goto out_only_mutex;
@@ -2741,8 +2758,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -2761,14 +2778,14 @@ out_only_mutex:
struct timespec64 now = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
- inode->i_mtime = now;
+ inode_set_mtime_to_ts(inode, now);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
int ret2;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
if (!ret)
ret = ret2;
@@ -2835,7 +2852,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode_set_ctime_current(inode);
i_size_write(inode, end);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
return ret ? ret : ret2;
@@ -2855,13 +2872,13 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
- if (em->block_start == EXTENT_MAP_HOLE)
+ if (em->disk_bytenr == EXTENT_MAP_HOLE)
ret = RANGE_BOUNDARY_HOLE;
- else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ else if (em->flags & EXTENT_FLAG_PREALLOC)
ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
@@ -2886,7 +2903,7 @@ static int btrfs_zero_range(struct inode *inode,
u64 bytes_to_reserve = 0;
bool space_reserved = false;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -2901,8 +2918,7 @@ static int btrfs_zero_range(struct inode *inode,
* extents and holes, we drop all the existing extents and allocate a
* new prealloc extent, so that we get a larger contiguous disk extent.
*/
- if (em->start <= alloc_start &&
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
const u64 em_end = em->start + em->len;
if (em_end >= offset + len) {
@@ -2924,26 +2940,25 @@ static int btrfs_zero_range(struct inode *inode,
ASSERT(IS_ALIGNED(alloc_start, sectorsize));
len = offset + len - alloc_start;
offset = alloc_start;
- alloc_hint = em->block_start + em->len;
+ alloc_hint = extent_map_block_start(em) + em->len;
}
free_extent_map(em);
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
- sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->flags & EXTENT_FLAG_PREALLOC) {
free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
}
- if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+ if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
free_extent_map(em);
ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
0);
@@ -3026,7 +3041,7 @@ reserve_space:
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
- i_blocksize(inode),
+ fs_info->sectorsize,
offset + len, &alloc_hint);
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
@@ -3070,7 +3085,7 @@ static long btrfs_fallocate(struct file *file, int mode,
int ret;
/* Do not allow fallocate in ZONED mode */
- if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
+ if (btrfs_is_zoned(inode_to_fs_info(inode)))
return -EOPNOTSUPP;
alloc_start = round_down(offset, blocksize);
@@ -3128,7 +3143,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* the file range and, due to the previous locking we did, we know there
* can't be more delalloc or ordered extents in the range.
*/
- ret = btrfs_wait_ordered_range(inode, alloc_start,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
alloc_end - alloc_start);
if (ret)
goto out;
@@ -3147,7 +3162,7 @@ static long btrfs_fallocate(struct file *file, int mode,
/* First, check if we exceed the qgroup limit */
while (cur_offset < alloc_end) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
alloc_end - cur_offset);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -3156,9 +3171,9 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ !(em->flags & EXTENT_FLAG_PREALLOC))) {
const u64 range_len = last_byte - cur_offset;
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
@@ -3198,7 +3213,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (!ret) {
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
- range->len, i_blocksize(inode),
+ range->len, blocksize,
offset + len, &alloc_hint);
/*
* btrfs_prealloc_file_range() releases space even
@@ -3759,8 +3774,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
- FMODE_CAN_ODIRECT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
ret = fsverity_file_open(inode, filp);
if (ret)
@@ -3768,97 +3782,6 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
-static int check_direct_read(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- int ret;
- int i, seg;
-
- ret = check_direct_IO(fs_info, iter, offset);
- if (ret < 0)
- return ret;
-
- if (!iter_is_iovec(iter))
- return 0;
-
- for (seg = 0; seg < iter->nr_segs; seg++) {
- for (i = seg + 1; i < iter->nr_segs; i++) {
- const struct iovec *iov1 = iter_iov(iter) + seg;
- const struct iovec *iov2 = iter_iov(iter) + i;
-
- if (iov1->iov_base == iov2->iov_base)
- return -EINVAL;
- }
- }
- return 0;
-}
-
-static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- size_t prev_left = 0;
- ssize_t read = 0;
- ssize_t ret;
-
- if (fsverity_active(inode))
- return 0;
-
- if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
- return 0;
-
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
-again:
- /*
- * This is similar to what we do for direct IO writes, see the comment
- * at btrfs_direct_write(), but we also disable page faults in addition
- * to disabling them only at the iov_iter level. This is because when
- * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
- * which can still trigger page fault ins despite having set ->nofault
- * to true of our 'to' iov_iter.
- *
- * The difference to direct IO writes is that we deadlock when trying
- * to lock the extent range in the inode's tree during he page reads
- * triggered by the fault in (while for writes it is due to waiting for
- * our own ordered extent). This is because for direct IO reads,
- * btrfs_dio_iomap_begin() returns with the extent range locked, which
- * is only unlocked in the endio callback (end_bio_extent_readpage()).
- */
- pagefault_disable();
- to->nofault = true;
- ret = btrfs_dio_read(iocb, to, read);
- to->nofault = false;
- pagefault_enable();
-
- /* No increment (+=) because iomap returns a cumulative value. */
- if (ret > 0)
- read = ret;
-
- if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
- const size_t left = iov_iter_count(to);
-
- if (left == prev_left) {
- /*
- * We didn't make any progress since the last attempt,
- * fallback to a buffered read for the remainder of the
- * range. This is just to avoid any possibility of looping
- * for too long.
- */
- ret = read;
- } else {
- /*
- * We made some progress since the last retry or this is
- * the first time we are retrying. Fault in as many pages
- * as possible and retry.
- */
- fault_in_iov_iter_writeable(to, left);
- prev_left = left;
- goto again;
- }
- }
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
- return ret < 0 ? ret : read;
-}
-
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
@@ -3890,10 +3813,12 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
+ .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
/*
@@ -3910,10 +3835,9 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
* know better and pull this out at some point in the future, it is
* right and you are wrong.
*/
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ ret = filemap_fdatawrite_range(mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
+ ret = filemap_fdatawrite_range(mapping, start, end);
return ret;
}
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index 82b34fbb295f..912254e653cf 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -3,6 +3,21 @@
#ifndef BTRFS_FILE_H
#define BTRFS_FILE_H
+#include <linux/types.h>
+
+struct file;
+struct extent_state;
+struct kiocb;
+struct iov_iter;
+struct page;
+struct btrfs_ioctl_encoded_io_args;
+struct btrfs_drop_extents_args;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_replace_extent_info;
+struct btrfs_trans_handle;
+
extern const struct file_operations btrfs_file_operations;
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
@@ -22,12 +37,14 @@ int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached, bool noreserve);
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait);
void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret);
+int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count);
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i);
#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a6ec9344c3e..f4bcb2530660 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -19,9 +19,7 @@
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
-#include "volumes.h"
#include "space-info.h"
-#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
@@ -57,6 +55,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, bool update_stats);
+static void btrfs_crc32c_final(u32 crc, u8 *result)
+{
+ put_unaligned_le32(~crc, result);
+}
+
static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
@@ -79,7 +82,6 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
u64 offset)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
@@ -113,7 +115,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
+ inode = btrfs_iget_path(location.objectid, root, path);
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
@@ -135,7 +137,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
spin_lock(&block_group->lock);
if (block_group->inode)
- inode = igrab(block_group->inode);
+ inode = igrab(&block_group->inode->vfs_inode);
spin_unlock(&block_group->lock);
if (inode)
return inode;
@@ -154,7 +156,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
}
if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
- block_group->inode = igrab(inode);
+ block_group->inode = BTRFS_I(igrab(inode));
spin_unlock(&block_group->lock);
return inode;
@@ -354,7 +356,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
if (ret)
goto fail;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
fail:
if (locked)
@@ -394,7 +396,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
return -ENOMEM;
io_ctl->num_pages = num_pages;
- io_ctl->fs_info = btrfs_sb(inode->i_sb);
+ io_ctl->fs_info = inode_to_fs_info(inode);
io_ctl->inode = inode;
return 0;
@@ -434,8 +436,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- btrfs_page_clear_checked(io_ctl->fs_info,
- io_ctl->pages[i],
+ btrfs_folio_clear_checked(io_ctl->fs_info,
+ page_folio(io_ctl->pages[i]),
page_offset(io_ctl->pages[i]),
PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
@@ -540,7 +542,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
- crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
@@ -562,7 +564,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
val = *tmp;
io_ctl_map_page(io_ctl, 0);
- crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->fs_info,
@@ -1266,7 +1268,7 @@ static int flush_dirty_cache(struct inode *inode)
{
int ret;
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DELALLOC, NULL);
@@ -1322,7 +1324,7 @@ out:
"failed to write free space cache for block group %llu error %d",
block_group->start, ret);
}
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode(trans, BTRFS_I(inode));
if (block_group) {
/* the dirty list is protected by the dirty_bgs_lock */
@@ -1363,7 +1365,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
/*
* Write out cached info to an inode.
*
- * @root: root the inode belongs to
* @inode: freespace inode we are writing out
* @ctl: free space cache we are going to write out
* @block_group: block_group for this cache if it belongs to a block_group
@@ -1374,7 +1375,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
* on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
-static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+static int __btrfs_write_out_cache(struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
@@ -1482,7 +1483,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl->entries = entries;
io_ctl->bitmaps = bitmaps;
- ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
+ ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
goto out;
@@ -1507,7 +1508,7 @@ out:
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode(trans, BTRFS_I(inode));
if (must_iput)
iput(inode);
return ret;
@@ -1533,8 +1534,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return 0;
- ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
- block_group, &block_group->io_ctl, trans);
+ ret = __btrfs_write_out_cache(inode, ctl, block_group,
+ &block_group->io_ctl, trans);
if (ret) {
btrfs_debug(fs_info,
"failed to write free space cache for block group %llu error %d",
@@ -2618,7 +2619,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
}
}
-int __btrfs_add_free_space(struct btrfs_block_group *block_group,
+static int __btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes,
enum btrfs_trim_state trim_state)
{
@@ -4157,15 +4158,13 @@ out:
int __init btrfs_free_space_init(void)
{
- btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
- sizeof(struct btrfs_free_space), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0);
if (!btrfs_free_space_cachep)
return -ENOMEM;
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
- SLAB_MEM_SPREAD, NULL);
+ 0, NULL);
if (!btrfs_free_space_bitmap_cachep) {
kmem_cache_destroy(btrfs_free_space_cachep);
return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index bd80c7b2af96..9f1dbfdee8ca 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -6,7 +6,19 @@
#ifndef BTRFS_FREE_SPACE_CACHE_H
#define BTRFS_FREE_SPACE_CACHE_H
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include <linux/freezer.h>
+#include "fs.h"
+
+struct inode;
+struct page;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_trans_handle;
+struct btrfs_trim_block_group;
/*
* This is the trim state of an extent or bitmap.
@@ -121,8 +133,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl);
-int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr,
- u64 size, enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index a0d8160b5375..308abbf8855b 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1104,11 +1104,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
- ASSERT(ret == 0);
+ /*
+ * If ret is 1 (no key found), it means this is an empty block group,
+ * without any extents allocated from it and there's no block group
+ * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
+ * because we are using the block group tree feature, so block group
+ * items are stored in the block group tree. It also means there are no
+ * extents allocated for block groups with a start offset beyond this
+ * block group's end offset (this is the last, highest, block group).
+ */
+ if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
+ ASSERT(ret == 0);
start = block_group->start;
end = block_group->start + block_group->length;
- while (1) {
+ while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1138,8 +1148,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
- if (ret)
- break;
}
if (start < end) {
ret = __add_to_free_space_tree(trans, block_group, path2,
@@ -1176,12 +1184,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
ret = PTR_ERR(free_space_root);
- goto abort;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
}
ret = btrfs_global_root_insert(free_space_root);
if (ret) {
btrfs_put_root(free_space_root);
- goto abort;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
}
node = rb_first_cached(&fs_info->block_group_cache_tree);
@@ -1189,8 +1201,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
+ }
node = rb_next(node);
}
@@ -1206,11 +1221,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
-abort:
+out_clear:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
return ret;
}
@@ -1273,12 +1286,18 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
ret = btrfs_del_root(trans, &free_space_root->root_key);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
btrfs_global_root_delete(free_space_root);
@@ -1299,11 +1318,6 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
}
return btrfs_commit_transaction(trans);
-
-abort:
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
}
int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
@@ -1326,8 +1340,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
@@ -1336,8 +1353,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
node = rb_next(node);
}
@@ -1348,10 +1368,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
ret = btrfs_commit_transaction(trans);
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
-abort:
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
}
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 6d5551d0ced8..e6c6d6f4f221 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -6,7 +6,13 @@
#ifndef BTRFS_FREE_SPACE_TREE_H
#define BTRFS_FREE_SPACE_TREE_H
+#include <linux/bits.h>
+
struct btrfs_caching_control;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_block_group;
+struct btrfs_trans_handle;
/*
* The default size for new free space bitmap items. The last bitmap in a block
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index d24d41f7811a..374843aca60d 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -4,14 +4,49 @@
#define BTRFS_FS_H
#include <linux/blkdev.h>
-#include <linux/fs.h>
-#include <linux/btrfs_tree.h>
#include <linux/sizes.h>
+#include <linux/time64.h>
+#include <linux/compiler.h>
+#include <linux/math.h>
+#include <linux/atomic.h>
+#include <linux/percpu_counter.h>
+#include <linux/completion.h>
+#include <linux/lockdep.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwlock_types.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/wait_bit.h>
+#include <linux/sched.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "extent-io-tree.h"
-#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
+struct inode;
+struct super_block;
+struct kobject;
+struct reloc_control;
+struct crypto_shash;
+struct ulist;
+struct btrfs_device;
+struct btrfs_block_group;
+struct btrfs_root;
+struct btrfs_fs_devices;
+struct btrfs_transaction;
+struct btrfs_delayed_root;
+struct btrfs_balance_control;
+struct btrfs_subpage_info;
+struct btrfs_stripe_hash_table;
+struct btrfs_space_info;
+
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
@@ -63,7 +98,9 @@ enum {
/* The btrfs_fs_info created for self-tests */
BTRFS_FS_STATE_DUMMY_FS_INFO,
- BTRFS_FS_STATE_NO_CSUMS,
+ /* Checksum errors are ignored. */
+ BTRFS_FS_STATE_NO_DATA_CSUMS,
+ BTRFS_FS_STATE_SKIP_META_CSUMS,
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
@@ -139,6 +176,12 @@ enum {
*/
BTRFS_FS_FEATURE_CHANGED,
+ /*
+ * Indicate that we have found a tree block which is only aligned to
+ * sectorsize, but not to nodesize. This should be rare nowadays.
+ */
+ BTRFS_FS_UNALIGNED_TREE_BLOCK,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -152,38 +195,39 @@ enum {
* Note: don't forget to add new options to btrfs_show_options()
*/
enum {
- BTRFS_MOUNT_NODATASUM = (1UL << 0),
- BTRFS_MOUNT_NODATACOW = (1UL << 1),
- BTRFS_MOUNT_NOBARRIER = (1UL << 2),
- BTRFS_MOUNT_SSD = (1UL << 3),
- BTRFS_MOUNT_DEGRADED = (1UL << 4),
- BTRFS_MOUNT_COMPRESS = (1UL << 5),
- BTRFS_MOUNT_NOTREELOG = (1UL << 6),
- BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
- BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
- BTRFS_MOUNT_NOSSD = (1UL << 9),
- BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
- BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
- BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
- BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
- BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
- BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
- BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
- BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
- BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
- BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
- BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
- BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
- BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
- BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
- BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
- BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
- BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
- BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
- BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
- BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
- BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
- BTRFS_MOUNT_NODISCARD = (1UL << 31),
+ BTRFS_MOUNT_NODATASUM = (1ULL << 0),
+ BTRFS_MOUNT_NODATACOW = (1ULL << 1),
+ BTRFS_MOUNT_NOBARRIER = (1ULL << 2),
+ BTRFS_MOUNT_SSD = (1ULL << 3),
+ BTRFS_MOUNT_DEGRADED = (1ULL << 4),
+ BTRFS_MOUNT_COMPRESS = (1ULL << 5),
+ BTRFS_MOUNT_NOTREELOG = (1ULL << 6),
+ BTRFS_MOUNT_FLUSHONCOMMIT = (1ULL << 7),
+ BTRFS_MOUNT_SSD_SPREAD = (1ULL << 8),
+ BTRFS_MOUNT_NOSSD = (1ULL << 9),
+ BTRFS_MOUNT_DISCARD_SYNC = (1ULL << 10),
+ BTRFS_MOUNT_FORCE_COMPRESS = (1ULL << 11),
+ BTRFS_MOUNT_SPACE_CACHE = (1ULL << 12),
+ BTRFS_MOUNT_CLEAR_CACHE = (1ULL << 13),
+ BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1ULL << 14),
+ BTRFS_MOUNT_ENOSPC_DEBUG = (1ULL << 15),
+ BTRFS_MOUNT_AUTO_DEFRAG = (1ULL << 16),
+ BTRFS_MOUNT_USEBACKUPROOT = (1ULL << 17),
+ BTRFS_MOUNT_SKIP_BALANCE = (1ULL << 18),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1ULL << 19),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1ULL << 20),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1ULL << 21),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1ULL << 22),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1ULL << 23),
+ BTRFS_MOUNT_NOLOGREPLAY = (1ULL << 24),
+ BTRFS_MOUNT_REF_VERIFY = (1ULL << 25),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1ULL << 26),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1ULL << 27),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1ULL << 28),
+ BTRFS_MOUNT_NODISCARD = (1ULL << 29),
+ BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30),
+ BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31),
+ BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32),
};
/*
@@ -216,7 +260,8 @@ enum {
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED)
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -225,6 +270,7 @@ enum {
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
#else
@@ -239,6 +285,7 @@ enum {
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_WARNING_COMMIT_INTERVAL (300)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
struct btrfs_dev_replace {
@@ -371,6 +418,7 @@ struct btrfs_fs_info {
struct btrfs_root *uuid_root;
struct btrfs_root *data_reloc_root;
struct btrfs_root *block_group_root;
+ struct btrfs_root *stripe_root;
/* The log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -393,7 +441,8 @@ struct btrfs_fs_info {
struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
- struct extent_map_tree mapping_tree;
+ struct rb_root_cached mapping_tree;
+ rwlock_t mapping_tree_lock;
/*
* Block reservation for extent, checksum, root tree and delayed dir
@@ -411,7 +460,17 @@ struct btrfs_fs_info {
struct btrfs_block_rsv empty_block_rsv;
+ /*
+ * Updated while holding the lock 'trans_lock'. Due to the life cycle of
+ * a transaction, it can be directly read while holding a transaction
+ * handle, everywhere else must be read with btrfs_get_fs_generation().
+ * Should always be updated using btrfs_set_fs_generation().
+ */
u64 generation;
+ /*
+ * Always use btrfs_get_last_trans_committed() and
+ * btrfs_set_last_trans_committed() to read and update this field.
+ */
u64 last_trans_committed;
/*
* Generation of the last transaction used for block group relocation
@@ -425,7 +484,7 @@ struct btrfs_fs_info {
* required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
- unsigned long mount_opt;
+ unsigned long long mount_opt;
unsigned long compress_type:4;
unsigned int compress_level;
@@ -576,6 +635,13 @@ struct btrfs_fs_info {
s32 dirty_metadata_batch;
s32 delalloc_batch;
+ struct percpu_counter evictable_extent_maps;
+ spinlock_t extent_map_shrinker_lock;
+ u64 extent_map_shrinker_last_root;
+ u64 extent_map_shrinker_last_ino;
+ atomic64_t extent_map_shrinker_nr_to_scan;
+ struct work_struct extent_map_shrinker_work;
+
/* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;
@@ -642,14 +708,11 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
+ u32 sectors_per_page;
struct workqueue_struct *scrub_workers;
- struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- u32 check_integrity_print_mask;
-#endif
/* Is qgroup tracking in a consistent state? */
u64 qgroup_flags;
@@ -685,6 +748,7 @@ struct btrfs_fs_info {
/* Protected by qgroup_rescan_lock */
bool qgroup_rescan_running;
u8 qgroup_drop_subtree_thres;
+ u64 qgroup_enable_gen;
/*
* If this is not 0, then it indicates a serious filesystem error has
@@ -717,10 +781,13 @@ struct btrfs_fs_info {
/* Reclaim partially filled block groups in the background */
struct work_struct reclaim_bgs_work;
+ /* Protected by unused_bgs_lock. */
struct list_head reclaim_bgs;
int bg_reclaim_threshold;
+ /* Protects the lists unused_bgs and reclaim_bgs. */
spinlock_t unused_bgs_lock;
+ /* Protected by unused_bgs_lock. */
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
/* Protect block groups that are going to be deleted */
@@ -814,6 +881,37 @@ struct btrfs_fs_info {
#endif
};
+#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \
+ struct page *: (_page))->mapping->host))
+#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \
+ struct folio *: (_folio))->mapping->host))
+
+#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info)
+#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info)
+
+#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
+ struct inode *: (_inode)))->root->fs_info)
+
+static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->generation);
+}
+
+static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen)
+{
+ WRITE_ONCE(fs_info->generation, gen);
+}
+
+static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_trans_committed);
+}
+
+static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen)
+{
+ WRITE_ONCE(fs_info->last_trans_committed, gen);
+}
+
static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
u64 gen)
{
@@ -868,7 +966,7 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
/*
* Count how many fs_info->max_extent_size cover the @size
*/
-static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 size)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (!fs_info)
@@ -887,6 +985,8 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation op);
+int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);
+
/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
@@ -927,21 +1027,7 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (!btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_set_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_clear_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info)
{
/* Do it this way so we only ever do one test_bit in the normal case. */
if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
@@ -960,7 +1046,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
* since setting and checking for SB_RDONLY in the superblock's flags is not
* atomic.
*/
-static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
+static inline int btrfs_need_cleaner_sleep(const struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
btrfs_fs_closing(fs_info);
@@ -981,7 +1067,7 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
#define EXPORT_FOR_TESTS
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
}
@@ -992,7 +1078,7 @@ void btrfs_test_destroy_inode(struct inode *inode);
#define EXPORT_FOR_TESTS static
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
return 0;
}
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index d3ff97374d48..29572dfaf878 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -9,13 +9,12 @@
#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "space-info.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
const struct fscrypt_str *name)
{
@@ -43,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
}
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
- struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const struct extent_buffer *leaf, int slot, u64 ref_objectid,
const struct fscrypt_str *name)
{
struct btrfs_inode_extref *extref;
@@ -142,8 +141,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name);
if (!extref) {
- btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
- ret = -EROFS;
+ btrfs_abort_transaction(trans, -ENOENT);
+ ret = -ENOENT;
goto out;
}
@@ -247,7 +246,7 @@ out:
}
/*
- * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ * Insert an extended inode ref into a tree.
*
* The caller must have checked against BTRFS_LINK_MAX already.
*/
@@ -424,9 +423,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
return ret;
}
-static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
- struct extent_buffer *leaf,
- struct btrfs_file_extent_item *fi,
+static inline void btrfs_trace_truncate(const struct btrfs_inode *inode,
+ const struct extent_buffer *leaf,
+ const struct btrfs_file_extent_item *fi,
u64 offset, int extent_type, int slot)
{
if (!inode)
@@ -671,15 +670,18 @@ delete:
}
if (del_item && extent_start != 0 && !control->skip_ref_updates) {
- struct btrfs_ref ref = { 0 };
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = extent_start,
+ .num_bytes = extent_num_bytes,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_header_owner(leaf),
+ };
bytes_deleted += extent_num_bytes;
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- control->ino, extent_offset,
- root->root_key.objectid, false);
+ btrfs_init_data_ref(&ref, control->ino, extent_offset,
+ btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index ede43b6c6559..c11b97fdccc4 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -4,14 +4,17 @@
#define BTRFS_INODE_ITEM_H
#include <linux/types.h>
+#include <linux/crc32c.h>
+struct fscrypt_str;
+struct extent_buffer;
struct btrfs_trans_handle;
struct btrfs_root;
struct btrfs_path;
struct btrfs_key;
struct btrfs_inode_extref;
struct btrfs_inode;
-struct extent_buffer;
+struct btrfs_truncate_control;
/*
* Return this if we need to call truncate_block for the last bit of the
@@ -76,6 +79,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
*ro_flags = (u32)(inode_item_flags >> 32);
}
+/* Figure the key offset of an extended inode ref. */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len)
+{
+ return (u64)crc32c(parent_objectid, name, len);
+}
+
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_truncate_control *control);
@@ -100,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref(
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow);
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
const struct fscrypt_str *name);
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
- struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const struct extent_buffer *leaf, int slot, u64 ref_objectid,
const struct fscrypt_str *name);
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 035815c43949..f84e3f9fad84 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,21 +32,19 @@
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "bio.h"
#include "compression.h"
#include "locking.h"
-#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
@@ -71,31 +69,14 @@
#include "super.h"
#include "orphan.h"
#include "backref.h"
+#include "raid-stripe-tree.h"
+#include "fiemap.h"
struct btrfs_iget_args {
u64 ino;
struct btrfs_root *root;
};
-struct btrfs_dio_data {
- ssize_t submitted;
- struct extent_changeset *data_reserved;
- struct btrfs_ordered_extent *ordered;
- bool data_space_reserved;
- bool nocow_done;
-};
-
-struct btrfs_dio_private {
- /* Range of I/O */
- u64 file_offset;
- u32 bytes;
-
- /* This must be last */
- struct btrfs_bio bbio;
-};
-
-static struct bio_set btrfs_dio_bioset;
-
struct btrfs_rename_ctx {
/* Output field. Stores the index number of the old directory entry. */
u64 index;
@@ -113,6 +94,15 @@ struct data_reloc_warn {
int mirror_num;
};
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -126,14 +116,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty);
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
- u64 len, u64 orig_start, u64 block_start,
- u64 block_len, u64 orig_block_len,
- u64 ram_bytes, int compress_type,
- int type);
static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
u64 root, void *warn_ctx)
@@ -246,7 +231,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid, btrfs_ino(inode), file_off,
+ btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
@@ -256,7 +241,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
logical += file_off;
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
btrfs_ino(inode), file_off, logical,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -323,15 +308,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* Output without objectid, which is more meaningful */
- if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
+ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -339,7 +324,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
} else {
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -348,7 +333,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
}
/*
- * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ * Lock inode i_rwsem based on arguments passed.
*
* ilock_flags can have the following bit set:
*
@@ -382,7 +367,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
}
/*
- * btrfs_inode_unlock - unock inode i_rwsem
+ * Unock inode i_rwsem.
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
@@ -408,17 +393,17 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
* extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
- struct page *locked_page,
+ struct folio *locked_folio,
u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
u64 page_start = 0, page_end = 0;
- struct page *page;
+ struct folio *folio;
- if (locked_page) {
- page_start = page_offset(locked_page);
- page_end = page_start + PAGE_SIZE - 1;
+ if (locked_folio) {
+ page_start = folio_pos(locked_folio);
+ page_end = page_start + folio_size(locked_folio) - 1;
}
while (index <= end_index) {
@@ -432,13 +417,13 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* btrfs_mark_ordered_io_finished() would skip the accounting
* for the page range, and the ordered extent will never finish.
*/
- if (locked_page && index == (page_start >> PAGE_SHIFT)) {
+ if (locked_folio && index == (page_start >> PAGE_SHIFT)) {
index++;
continue;
}
- page = find_get_page(inode->vfs_inode.i_mapping, index);
+ folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
index++;
- if (!page)
+ if (IS_ERR(folio))
continue;
/*
@@ -446,14 +431,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
- offset, bytes);
- put_page(page);
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
+ offset, bytes);
+ folio_put(folio);
}
- if (locked_page) {
+ if (locked_folio) {
/* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_start + PAGE_SIZE)
+ if (bytes + offset <= page_start + folio_size(locked_folio))
return;
/*
* In case this page belongs to the delalloc range being
@@ -462,8 +447,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* run_delalloc_range
*/
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
- offset = page_offset(locked_page) + PAGE_SIZE;
+ bytes = offset + bytes - folio_pos(locked_folio) -
+ folio_size(locked_folio);
+ offset = folio_pos(locked_folio) + folio_size(locked_folio);
}
}
@@ -504,12 +490,12 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, bool extent_inserted,
size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages,
+ struct folio *compressed_folio,
bool update_i_size)
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
- struct page *page = NULL;
+ const u32 sectorsize = trans->fs_info->sectorsize;
char *kaddr;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
@@ -517,10 +503,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
u64 i_size;
- ASSERT((compressed_size > 0 && compressed_pages) ||
- (compressed_size == 0 && !compressed_pages));
+ /*
+ * The decompressed size must still be no larger than a sector. Under
+ * heavy race, we can have size == 0 passed in, but that shouldn't be a
+ * big deal and we can continue the insertion.
+ */
+ ASSERT(size <= sectorsize);
+
+ /*
+ * The compressed size also needs to be no larger than a sector.
+ * That's also why we only need one page as the parameter.
+ */
+ if (compressed_folio)
+ ASSERT(compressed_size <= sectorsize);
+ else
+ ASSERT(compressed_size == 0);
- if (compressed_size && compressed_pages)
+ if (compressed_size && compressed_folio)
cur_size = compressed_size;
if (!extent_inserted) {
@@ -548,30 +547,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
ptr = btrfs_file_extent_inline_start(ei);
if (compress_type != BTRFS_COMPRESS_NONE) {
- struct page *cpage;
- int i = 0;
- while (compressed_size > 0) {
- cpage = compressed_pages[i];
- cur_size = min_t(unsigned long, compressed_size,
- PAGE_SIZE);
-
- kaddr = kmap_local_page(cpage);
- write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_local(kaddr);
+ kaddr = kmap_local_folio(compressed_folio, 0);
+ write_extent_buffer(leaf, kaddr, ptr, compressed_size);
+ kunmap_local(kaddr);
- i++;
- ptr += cur_size;
- compressed_size -= cur_size;
- }
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
- page = find_get_page(inode->vfs_inode.i_mapping, 0);
+ struct folio *folio;
+
+ folio = __filemap_get_folio(inode->vfs_inode.i_mapping,
+ 0, 0, 0);
+ ASSERT(!IS_ERR(folio));
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
write_extent_buffer(leaf, kaddr, ptr, size);
kunmap_local(kaddr);
- put_page(page);
+ folio_put(folio);
}
btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -603,17 +595,62 @@ fail:
return ret;
}
+static bool can_cow_file_range_inline(struct btrfs_inode *inode,
+ u64 offset, u64 size,
+ size_t compressed_size)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 data_len = (compressed_size ?: size);
+
+ /* Inline extents must start at offset 0. */
+ if (offset != 0)
+ return false;
+
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (fs_info->sectorsize != PAGE_SIZE)
+ return false;
+
+ /* Inline extents are limited to sectorsize. */
+ if (size > fs_info->sectorsize)
+ return false;
+
+ /* We cannot exceed the maximum inline data size. */
+ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ return false;
+
+ /* We cannot exceed the user specified max_inline size. */
+ if (data_len > fs_info->max_inline)
+ return false;
+
+ /* Inline extents must be the entirety of the file. */
+ if (size < i_size_read(&inode->vfs_inode))
+ return false;
+
+ return true;
+}
/*
* conditionally insert an inline extent into the file. This
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
+ *
+ * If being used directly, you must have already checked we're allowed to cow
+ * the range by getting true from can_cow_file_range_inline().
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
- size_t compressed_size,
- int compress_type,
- struct page **compressed_pages,
- bool update_i_size)
+static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+ u64 size, size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
@@ -623,18 +660,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
int ret;
struct btrfs_path *path;
- /*
- * We can create an inline extent if it ends at or beyond the current
- * i_size, is no larger than a sector (decompressed), and the (possibly
- * compressed) data fits in a leaf and the configured maximum inline
- * size.
- */
- if (size < i_size_read(&inode->vfs_inode) ||
- size > fs_info->sectorsize ||
- data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- data_len > fs_info->max_inline)
- return 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -660,7 +685,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
- compressed_pages, update_i_size);
+ compressed_folio, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -670,7 +695,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
}
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -693,19 +718,68 @@ out:
return ret;
}
+static noinline int cow_file_range_inline(struct btrfs_inode *inode,
+ struct folio *locked_folio,
+ u64 offset, u64 end,
+ size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
+{
+ struct extent_state *cached = NULL;
+ unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
+ u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
+ int ret;
+
+ if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
+ return 1;
+
+ lock_extent(&inode->io_tree, offset, end, &cached);
+ ret = __cow_file_range_inline(inode, offset, size, compressed_size,
+ compress_type, compressed_folio,
+ update_i_size);
+ if (ret > 0) {
+ unlock_extent(&inode->io_tree, offset, end, &cached);
+ return ret;
+ }
+
+ /*
+ * In the successful case (ret == 0 here), cow_file_range will return 1.
+ *
+ * Quite a bit further up the callstack in extent_writepage(), ret == 1
+ * is treated as a short circuited success and does not unlock the folio,
+ * so we must do it here.
+ *
+ * In the failure case, the locked_folio does get unlocked by
+ * btrfs_folio_end_all_writers, which asserts that it is still locked
+ * at that point, so we must *not* unlock it here.
+ *
+ * The other two callsites in compress_file_range do not have a
+ * locked_folio, so they are not relevant to this logic.
+ */
+ if (ret == 0)
+ locked_folio = NULL;
+
+ extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
+ clear_flags, PAGE_UNLOCK |
+ PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
+ return ret;
+}
+
struct async_extent {
u64 start;
u64 ram_size;
u64 compressed_size;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
int compress_type;
struct list_head list;
};
struct async_chunk {
struct btrfs_inode *inode;
- struct page *locked_page;
+ struct folio *locked_folio;
u64 start;
u64 end;
blk_opf_t write_flags;
@@ -723,8 +797,8 @@ struct async_cow {
static noinline int add_async_extent(struct async_chunk *cow,
u64 start, u64 ram_size,
u64 compressed_size,
- struct page **pages,
- unsigned long nr_pages,
+ struct folio **folios,
+ unsigned long nr_folios,
int compress_type)
{
struct async_extent *async_extent;
@@ -735,8 +809,8 @@ static noinline int add_async_extent(struct async_chunk *cow,
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
- async_extent->pages = pages;
- async_extent->nr_pages = nr_pages;
+ async_extent->folios = folios;
+ async_extent->nr_folios = nr_folios;
async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
@@ -801,7 +875,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
if (btrfs_test_opt(fs_info, COMPRESS) ||
inode->flags & BTRFS_INODE_COMPRESS ||
inode->prop_compress)
- return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
+ return btrfs_compress_heuristic(inode, start, end);
return 0;
}
@@ -811,7 +885,27 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode, small_write);
+ btrfs_add_inode_defrag(inode, small_write);
+}
+
+static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct folio *folio;
+ int ret = 0;
+
+ for (unsigned long index = start >> PAGE_SHIFT;
+ index <= end_index; index++) {
+ folio = __filemap_get_folio(inode->i_mapping, index, 0, 0);
+ if (IS_ERR(folio)) {
+ if (!ret)
+ ret = PTR_ERR(folio);
+ continue;
+ }
+ folio_clear_dirty_for_io(folio);
+ folio_put(folio);
+ }
+ return ret;
}
/*
@@ -840,8 +934,8 @@ static void compress_file_range(struct btrfs_work *work)
u64 actual_end;
u64 i_size;
int ret = 0;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
unsigned int poff;
@@ -855,7 +949,16 @@ static void compress_file_range(struct btrfs_work *work)
* Otherwise applications with the file mmap'd can wander in and change
* the page contents while we are compressing them.
*/
- extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+ ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+
+ /*
+ * All the folios should have been locked thus no failure.
+ *
+ * And even if some folios are missing, btrfs_compress_folios()
+ * would handle them correctly, so here just do an ASSERT() check for
+ * early logic errors.
+ */
+ ASSERT(ret == 0);
/*
* We need to save i_size before now because it could change in between
@@ -871,9 +974,9 @@ static void compress_file_range(struct btrfs_work *work)
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
- pages = NULL;
- nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
+ folios = NULL;
+ nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
/*
* we don't want to send crud past the end of i_size through
@@ -922,8 +1025,8 @@ again:
if (!inode_need_compress(inode, start, end))
goto cleanup_and_bail_uncompressed;
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages) {
+ folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
+ if (!folios) {
/*
* Memory allocation failure is not a fatal error, we can fall
* back to uncompressed code.
@@ -937,9 +1040,9 @@ again:
compress_type = inode->prop_compress;
/* Compression level is applied here. */
- ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
- mapping, start, pages, &nr_pages, &total_in,
- &total_compressed);
+ ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
+ mapping, start, folios, &nr_folios, &total_in,
+ &total_compressed);
if (ret)
goto mark_incompressible;
@@ -949,7 +1052,7 @@ again:
*/
poff = offset_in_page(total_compressed);
if (poff)
- memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
+ folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
/*
* Try to create an inline extent.
@@ -960,43 +1063,16 @@ again:
* Check cow_file_range() for why we don't even try to create inline
* extent for the subpage case.
*/
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
- if (total_in < actual_end) {
- ret = cow_file_range_inline(inode, actual_end, 0,
- BTRFS_COMPRESS_NONE, NULL,
- false);
- } else {
- ret = cow_file_range_inline(inode, actual_end,
- total_compressed,
- compress_type, pages,
- false);
- }
- if (ret <= 0) {
- unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING;
-
- if (ret < 0)
- mapping_set_error(mapping, -EIO);
-
- /*
- * inline extent creation worked or returned error,
- * we don't need to create any more async work items.
- * Unlock and free up our temp pages.
- *
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be done _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- NULL,
- clear_flags,
- PAGE_UNLOCK |
- PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK);
- goto free_pages;
- }
+ if (total_in < actual_end)
+ ret = cow_file_range_inline(inode, NULL, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
+ else
+ ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
+ compress_type, folios[0], false);
+ if (ret <= 0) {
+ if (ret < 0)
+ mapping_set_error(mapping, -EIO);
+ goto free_pages;
}
/*
@@ -1018,8 +1094,8 @@ again:
* The async work queues will take care of doing actual allocation on
* disk for these compressed pages, and will submit the bios.
*/
- ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages,
- nr_pages, compress_type);
+ ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
+ nr_folios, compress_type);
BUG_ON(ret);
if (start + total_in < end) {
start += total_in;
@@ -1036,12 +1112,12 @@ cleanup_and_bail_uncompressed:
BTRFS_COMPRESS_NONE);
BUG_ON(ret);
free_pages:
- if (pages) {
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ if (folios) {
+ for (i = 0; i < nr_folios; i++) {
+ WARN_ON(folios[i]->mapping);
+ btrfs_free_compr_folio(folios[i]);
}
- kfree(pages);
+ kfree(folios);
}
}
@@ -1049,21 +1125,21 @@ static void free_async_extent_pages(struct async_extent *async_extent)
{
int i;
- if (!async_extent->pages)
+ if (!async_extent->folios)
return;
- for (i = 0; i < async_extent->nr_pages; i++) {
- WARN_ON(async_extent->pages[i]->mapping);
- put_page(async_extent->pages[i]);
+ for (i = 0; i < async_extent->nr_folios; i++) {
+ WARN_ON(async_extent->folios[i]->mapping);
+ btrfs_free_compr_folio(async_extent->folios[i]);
}
- kfree(async_extent->pages);
- async_extent->nr_pages = 0;
- async_extent->pages = NULL;
+ kfree(async_extent->folios);
+ async_extent->nr_folios = 0;
+ async_extent->folios = NULL;
}
static void submit_uncompressed_range(struct btrfs_inode *inode,
struct async_extent *async_extent,
- struct page *locked_page)
+ struct folio *locked_folio)
{
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1076,20 +1152,22 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
};
wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
- ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
+ ret = run_delalloc_cow(inode, locked_folio, start, end,
+ &wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
- btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
- if (locked_page) {
- const u64 page_start = page_offset(locked_page);
-
- set_page_writeback(locked_page);
- end_page_writeback(locked_page);
- btrfs_mark_ordered_io_finished(inode, locked_page,
+ btrfs_cleanup_ordered_extents(inode, locked_folio,
+ start, end - start + 1);
+ if (locked_folio) {
+ const u64 page_start = folio_pos(locked_folio);
+
+ folio_start_writeback(locked_folio);
+ folio_end_writeback(locked_folio);
+ btrfs_mark_ordered_io_finished(inode, locked_folio,
page_start, PAGE_SIZE,
!ret);
- mapping_set_error(locked_page->mapping, ret);
- unlock_page(locked_page);
+ mapping_set_error(locked_folio->mapping, ret);
+ folio_unlock(locked_folio);
}
}
}
@@ -1103,10 +1181,13 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
struct btrfs_key ins;
- struct page *locked_page = NULL;
+ struct folio *locked_folio = NULL;
+ struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
+ bool free_pages = false;
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1114,20 +1195,23 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
kthread_associate_blkcg(async_chunk->blkcg_css);
/*
- * If async_chunk->locked_page is in the async_extent range, we need to
+ * If async_chunk->locked_folio is in the async_extent range, we need to
* handle it.
*/
- if (async_chunk->locked_page) {
- u64 locked_page_start = page_offset(async_chunk->locked_page);
- u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
+ if (async_chunk->locked_folio) {
+ u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
+ u64 locked_folio_end = locked_folio_start +
+ folio_size(async_chunk->locked_folio) - 1;
- if (!(start >= locked_page_end || end <= locked_page_start))
- locked_page = async_chunk->locked_page;
+ if (!(start >= locked_folio_end || end <= locked_folio_start))
+ locked_folio = async_chunk->locked_folio;
}
- lock_extent(io_tree, start, end, NULL);
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
- submit_uncompressed_range(inode, async_extent, locked_page);
+ ASSERT(!async_extent->folios);
+ ASSERT(async_extent->nr_folios == 0);
+ submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
@@ -1142,34 +1226,30 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* non-contiguous space for the uncompressed size instead. So
* fall back to uncompressed.
*/
- submit_uncompressed_range(inode, async_extent, locked_page);
+ submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
+ lock_extent(io_tree, start, end, &cached);
+
/* Here we're doing allocation and writeback of the compressed pages */
- em = create_io_em(inode, start,
- async_extent->ram_size, /* len */
- start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.ram_bytes = async_extent->ram_size;
+ file_extent.num_bytes = async_extent->ram_size;
+ file_extent.offset = 0;
+ file_extent.compression = async_extent->compress_type;
+
+ em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserve;
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */
- async_extent->ram_size, /* num_bytes */
- async_extent->ram_size, /* ram_bytes */
- ins.objectid, /* disk_bytenr */
- ins.offset, /* disk_num_bytes */
- 0, /* offset */
- 1 << BTRFS_ORDERED_COMPRESSED,
- async_extent->compress_type);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ 1U << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1179,16 +1259,18 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
/* Clear dirty, set writeback and unlock the pages. */
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
btrfs_submit_compressed_write(ordered,
- async_extent->pages, /* compressed_pages */
- async_extent->nr_pages,
+ async_extent->folios, /* compressed_folios */
+ async_extent->nr_folios,
async_chunk->write_flags, true);
*alloc_hint = ins.objectid + ins.offset;
done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
+ if (free_pages)
+ free_async_extent_pages(async_extent);
kfree(async_extent);
return;
@@ -1197,7 +1279,8 @@ out_free_reserve:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ NULL, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
@@ -1207,13 +1290,13 @@ out_free_reserve:
kthread_associate_blkcg(NULL);
btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
- root->root_key.objectid, btrfs_ino(inode), start,
+ btrfs_root_id(root), btrfs_ino(inode), start,
async_extent->ram_size, ret);
kfree(async_extent);
}
-static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
- u64 num_bytes)
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ u64 num_bytes)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
@@ -1227,15 +1310,15 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* first block in this inode and use that as a hint. If that
* block is also bogus then just don't worry about it.
*/
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
free_extent_map(em);
em = search_extent_mapping(em_tree, 0, 0);
- if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
+ if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = extent_map_block_start(em);
if (em)
free_extent_map(em);
} else {
- alloc_hint = em->block_start;
+ alloc_hint = extent_map_block_start(em);
free_extent_map(em);
}
}
@@ -1250,21 +1333,21 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* allocate extents on disk for the range, and create ordered data structs
* in ram to track those extents.
*
- * locked_page is the page that writepage had locked already. We use
+ * locked_folio is the folio that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
- * When this function fails, it unlocks all pages except @locked_page.
+ * When this function fails, it unlocks all pages except @locked_folio.
*
* When this function successfully creates an inline extent, it returns 1 and
- * unlocks all pages including locked_page and starts I/O on them.
- * (In reality inline extents are limited to a single page, so locked_page is
+ * unlocks all pages including locked_folio and starts I/O on them.
+ * (In reality inline extents are limited to a single page, so locked_folio is
* the only page handled anyway).
*
* When this function succeed and creates a normal extent, the page locking
* status depends on the passed in flags:
*
* - If @keep_locked is set, all pages are kept locked.
- * - Else all pages except for @locked_page are unlocked.
+ * - Else all pages except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
* while-loop, the ordered extents created in previous iterations are kept
@@ -1273,12 +1356,13 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* example.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
- struct page *locked_page, u64 start, u64 end,
- u64 *done_offset,
+ struct folio *locked_folio, u64 start,
+ u64 end, u64 *done_offset,
bool keep_locked, bool no_inline)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_state *cached = NULL;
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
@@ -1304,57 +1388,36 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
- u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
- end + 1);
-
+ if (!no_inline) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, actual_end, 0,
+ ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
- if (ret == 0) {
- /*
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be run _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- locked_page,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
+ if (ret <= 0) {
/*
- * locked_page is locked by the caller of
- * writepage_delalloc(), not locked by
- * __process_pages_contig().
+ * We succeeded, return 1 so the caller knows we're done
+ * with this page and already handled the IO.
*
- * We can't let __process_pages_contig() to unlock it,
- * as it doesn't have any subpage::writers recorded.
- *
- * Here we manually unlock the page, since the caller
- * can't determine if it's an inline extent or a
- * compressed extent.
+ * If there was an error then cow_file_range_inline() has
+ * already done the cleanup.
*/
- unlock_page(locked_page);
- ret = 1;
+ if (ret == 0)
+ ret = 1;
goto done;
- } else if (ret < 0) {
- goto out_unlock;
}
}
- alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
+ alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
+
+ /*
+ * We're not doing compressed IO, don't unlock the first page (which
+ * the caller expects to stay locked), don't clear any dirty bits and
+ * don't set any writeback bits.
+ *
+ * Do set the Ordered (Private2) bit so we know this page was properly
+ * setup for writepage.
+ */
+ page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
+ page_ops |= PAGE_SET_ORDERED;
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1374,6 +1437,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
while (num_bytes > 0) {
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
@@ -1399,8 +1463,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
continue;
}
if (done_offset) {
- *done_offset = start - 1;
- return 0;
+ /*
+ * Move @end to the end of the processed range,
+ * and exit the loop to unlock the processed extents.
+ */
+ end = start - 1;
+ ret = 0;
+ break;
}
ret = -ENOSPC;
}
@@ -1410,25 +1479,35 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
extent_reserved = true;
ram_size = ins.offset;
- em = create_io_em(inode, start, ins.offset, /* len */
- start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- ram_size, /* ram_bytes */
- BTRFS_COMPRESS_NONE, /* compress_type */
- BTRFS_ORDERED_REGULAR /* type */);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = ins.offset;
+ file_extent.ram_bytes = ins.offset;
+ file_extent.offset = 0;
+ file_extent.compression = BTRFS_COMPRESS_NONE;
+
+ /*
+ * Locked range will be released either during error clean up or
+ * after the whole range is finished.
+ */
+ lock_extent(&inode->io_tree, start, start + ram_size - 1,
+ &cached);
+
+ em = btrfs_create_io_em(inode, start, &file_extent,
+ BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, start,
+ start + ram_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
- ram_size, ins.objectid, cur_alloc_size,
- 0, 1 << BTRFS_ORDERED_REGULAR,
- BTRFS_COMPRESS_NONE);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ 1U << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
+ unlock_extent(&inode->io_tree, start,
+ start + ram_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1456,27 +1535,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /*
- * We're not doing compressed IO, don't unlock the first page
- * (which the caller expects to stay locked), don't clear any
- * dirty bits and don't set any writeback bits
- *
- * Do set the Ordered (Private2) bit so we know this page was
- * properly setup for writepage.
- */
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
- page_ops |= PAGE_SET_ORDERED;
-
- extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
- locked_page,
- EXTENT_LOCKED | EXTENT_DELALLOC,
- page_ops);
- if (num_bytes < cur_alloc_size)
+ if (num_bytes < ram_size)
num_bytes = 0;
else
- num_bytes -= cur_alloc_size;
+ num_bytes -= ram_size;
alloc_hint = ins.objectid + ins.offset;
- start += cur_alloc_size;
+ start += ram_size;
extent_reserved = false;
/*
@@ -1487,6 +1551,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
}
+ extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done:
if (done_offset)
*done_offset = end;
@@ -1502,34 +1568,36 @@ out_unlock:
* Now, we have three regions to clean up:
*
* |-------(1)----|---(2)---|-------------(3)----------|
- * `- orig_start `- start `- start + cur_alloc_size `- end
+ * `- orig_start `- start `- start + ram_size `- end
*
* We process each region below.
*/
- clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
-
/*
* For the range (1). We have already instantiated the ordered extents
* for this region. They are cleaned up by
* btrfs_cleanup_ordered_extents() in e.g,
- * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
- * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
- * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
- * function.
+ * btrfs_run_delalloc_range().
+ * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
+ * are also handled by the cleanup function.
*
- * However, in case of @keep_locked, we still need to unlock the pages
- * (except @locked_page) to ensure all the pages are unlocked.
+ * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
+ * finish the writeback of the involved folios, which will be never submitted.
*/
- if (keep_locked && orig_start < start) {
- if (!locked_page)
+ if (orig_start < start) {
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
+ if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_page, 0, page_ops);
+ locked_folio, NULL, clear_bits, page_ops);
}
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
/*
* For the range (2). If we reserved an extent for our delalloc range
* (or a subrange) and failed to create the respective ordered extent,
@@ -1542,11 +1610,11 @@ out_unlock:
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size - 1,
- locked_page,
- clear_bits,
+ start + ram_size - 1,
+ locked_folio, &cached, clear_bits,
page_ops);
- start += cur_alloc_size;
+ btrfs_qgroup_free_data(inode, NULL, start, ram_size, NULL);
+ start += ram_size;
}
/*
@@ -1557,8 +1625,9 @@ out_unlock:
*/
if (start < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
- extent_clear_unlock_delalloc(inode, start, end, locked_page,
- clear_bits, page_ops);
+ extent_clear_unlock_delalloc(inode, start, end, locked_folio,
+ &cached, clear_bits, page_ops);
+ btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL);
}
return ret;
}
@@ -1567,8 +1636,11 @@ out_unlock:
* Phase two of compressed writeback. This is the ordered portion of the code,
* which only gets called in the order the work was queued. We walk all the
* async extents created by compress_file_range and send them down to the disk.
+ *
+ * If called with @do_free == true then it'll try to finish the work and free
+ * the work struct eventually.
*/
-static noinline void submit_compressed_extents(struct btrfs_work *work)
+static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{
struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work);
@@ -1577,6 +1649,19 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
unsigned long nr_pages;
u64 alloc_hint = 0;
+ if (do_free) {
+ struct async_cow *async_cow;
+
+ btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
+ return;
+ }
+
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
@@ -1593,23 +1678,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
cond_wake_up_nomb(&fs_info->async_submit_wait);
}
-static noinline void async_cow_free(struct btrfs_work *work)
-{
- struct async_chunk *async_chunk;
- struct async_cow *async_cow;
-
- async_chunk = container_of(work, struct async_chunk, work);
- btrfs_add_delayed_iput(async_chunk->inode);
- if (async_chunk->blkcg_css)
- css_put(async_chunk->blkcg_css);
-
- async_cow = async_chunk->async_cow;
- if (atomic_dec_and_test(&async_cow->num_chunks))
- kvfree(async_cow);
-}
-
static bool run_delalloc_compressed(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1628,7 +1698,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
if (!ctx)
return false;
- unlock_extent(&inode->io_tree, start, end, NULL);
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
async_chunk = ctx->chunks;
@@ -1650,15 +1719,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
INIT_LIST_HEAD(&async_chunk[i].extents);
/*
- * The locked_page comes all the way from writepage and its
- * the original page we were actually given. As we spread
+ * The locked_folio comes all the way from writepage and its
+ * the original folio we were actually given. As we spread
* this large delalloc region across multiple async_chunk
- * structs, only the first struct needs a pointer to locked_page
+ * structs, only the first struct needs a pointer to
+ * locked_folio.
*
* This way we don't need racey decisions about who is supposed
* to unlock it.
*/
- if (locked_page) {
+ if (locked_folio) {
/*
* Depending on the compressibility, the pages might or
* might not go through async. We want all of them to
@@ -1668,12 +1738,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* need full accuracy. Just account the whole thing
* against the first page.
*/
- wbc_account_cgroup_owner(wbc, locked_page,
+ wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
- async_chunk[i].locked_page = locked_page;
- locked_page = NULL;
+ async_chunk[i].locked_folio = locked_folio;
+ locked_folio = NULL;
} else {
- async_chunk[i].locked_page = NULL;
+ async_chunk[i].locked_folio = NULL;
}
if (blkcg_css != blkcg_root_css) {
@@ -1685,7 +1755,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
}
btrfs_init_work(&async_chunk[i].work, compress_file_range,
- submit_compressed_extents, async_cow_free);
+ submit_compressed_extents);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
@@ -1702,7 +1772,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* covered by the range.
*/
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty)
{
@@ -1710,48 +1780,27 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
int ret;
while (start <= end) {
- ret = cow_file_range(inode, locked_page, start, end, &done_offset,
- true, false);
+ ret = cow_file_range(inode, locked_folio, start, end,
+ &done_offset, true, false);
if (ret)
return ret;
- extent_write_locked_range(&inode->vfs_inode, locked_page, start,
- done_offset, wbc, pages_dirty);
+ extent_write_locked_range(&inode->vfs_inode, locked_folio,
+ start, done_offset, wbc, pages_dirty);
start = done_offset + 1;
}
return 1;
}
-static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes, bool nowait)
-{
- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
- struct btrfs_ordered_sum *sums;
- int ret;
- LIST_HEAD(list);
-
- ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
- &list, 0, nowait);
- if (ret == 0 && list_empty(&list))
- return 0;
-
- while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret < 0)
- return ret;
- return 1;
-}
-
-static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
- const u64 start, const u64 end)
+static int fallback_to_cow(struct btrfs_inode *inode,
+ struct folio *locked_folio, const u64 start,
+ const u64 end)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_state *cached_state = NULL;
u64 range_start = start;
u64 count;
int ret;
@@ -1788,6 +1837,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
+ lock_extent(io_tree, start, end, &cached_state);
count = count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
@@ -1806,13 +1856,15 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
NULL);
}
+ unlock_extent(io_tree, start, end, &cached_state);
/*
* Don't try to create inline extents, as a mix of inline extent that
* is written out and unlocked directly and a normal NOCOW extent
* doesn't work.
*/
- ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
+ true);
ASSERT(ret != 1);
return ret;
}
@@ -1832,13 +1884,11 @@ struct can_nocow_file_extent_args {
*/
bool free_path;
- /* Output fields. Only set when can_nocow_file_extent() returns 1. */
-
- u64 disk_bytenr;
- u64 disk_num_bytes;
- u64 extent_offset;
- /* Number of bytes that can be written to in NOCOW mode. */
- u64 num_bytes;
+ /*
+ * Output fields. Only set when can_nocow_file_extent() returns 1.
+ * The expected file extent for the NOCOW write.
+ */
+ struct btrfs_file_extent file_extent;
};
/*
@@ -1859,6 +1909,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *fi;
+ struct btrfs_root *csum_root;
+ u64 io_start;
u64 extent_end;
u8 extent_type;
int can_nocow = 0;
@@ -1871,11 +1923,6 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
goto out;
- /* Can't access these fields unless we know it's not an inline extent. */
- args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- args->extent_offset = btrfs_file_extent_offset(leaf, fi);
-
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
extent_type == BTRFS_FILE_EXTENT_REG)
goto out;
@@ -1891,7 +1938,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
goto out;
/* An explicit hole, must COW. */
- if (args->disk_bytenr == 0)
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
goto out;
/* Compressed/encrypted/encoded extents must be COWed. */
@@ -1902,6 +1949,12 @@ static int can_nocow_file_extent(struct btrfs_path *path,
extent_end = btrfs_file_extent_end(path);
+ args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
+ args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
+
/*
* The following checks can be expensive, as they need to take other
* locks and do btree or rbtree searches, so release the path to avoid
@@ -1910,8 +1963,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
btrfs_release_path(path);
ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
- key->offset - args->extent_offset,
- args->disk_bytenr, args->strict, path);
+ key->offset - args->file_extent.offset,
+ args->file_extent.disk_bytenr, args->strict, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1919,7 +1972,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (args->free_path) {
/*
* We don't need the path anymore, plus through the
- * csum_exist_in_range() call below we will end up allocating
+ * btrfs_lookup_csums_list() call below we will end up allocating
* another path. So free the path to avoid unnecessary extra
* memory usage.
*/
@@ -1932,16 +1985,19 @@ static int can_nocow_file_extent(struct btrfs_path *path,
atomic_read(&root->snapshot_force_cow))
goto out;
- args->disk_bytenr += args->extent_offset;
- args->disk_bytenr += args->start - key->offset;
- args->num_bytes = min(args->end + 1, extent_end) - args->start;
+ args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
+ args->file_extent.offset += args->start - key->offset;
+ io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
/*
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
- nowait);
+
+ csum_root = btrfs_csum_root(root->fs_info, io_start);
+ ret = btrfs_lookup_csums_list(csum_root, io_start,
+ io_start + args->file_extent.num_bytes - 1,
+ NULL, nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1955,6 +2011,110 @@ static int can_nocow_file_extent(struct btrfs_path *path,
}
/*
+ * Cleanup the dirty folios which will never be submitted due to error.
+ *
+ * When running a delalloc range, we may need to split the ranges (due to
+ * fragmentation or NOCOW). If we hit an error in the later part, we will error
+ * out and previously successfully executed range will never be submitted, thus
+ * we have to cleanup those folios by clearing their dirty flag, starting and
+ * finishing the writeback.
+ */
+static void cleanup_dirty_folios(struct btrfs_inode *inode,
+ struct folio *locked_folio,
+ u64 start, u64 end, int error)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ u32 len;
+
+ ASSERT(end + 1 - start < U32_MAX);
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(end + 1, fs_info->sectorsize));
+ len = end + 1 - start;
+
+ /*
+ * Handle the locked folio first.
+ * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
+ */
+ btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
+
+ for (pgoff_t index = start_index; index <= end_index; index++) {
+ struct folio *folio;
+
+ /* Already handled at the beginning. */
+ if (index == locked_folio->index)
+ continue;
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
+ /* Cache already dropped, no need to do any cleanup. */
+ if (IS_ERR(folio))
+ continue;
+ btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ mapping_set_error(mapping, error);
+}
+
+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct extent_state **cached,
+ struct can_nocow_file_extent_args *nocow_args,
+ u64 file_pos, bool is_prealloc)
+{
+ struct btrfs_ordered_extent *ordered;
+ u64 len = nocow_args->file_extent.num_bytes;
+ u64 end = file_pos + len - 1;
+ int ret = 0;
+
+ lock_extent(&inode->io_tree, file_pos, end, cached);
+
+ if (is_prealloc) {
+ struct extent_map *em;
+
+ em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(em);
+ }
+ free_extent_map(em);
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+ is_prealloc
+ ? (1U << BTRFS_ORDERED_PREALLOC)
+ : (1U << BTRFS_ORDERED_NOCOW));
+ if (IS_ERR(ordered)) {
+ if (is_prealloc)
+ btrfs_drop_extent_map_range(inode, file_pos, end, false);
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(ordered);
+ }
+
+ if (btrfs_is_data_reloc_root(inode->root))
+ /*
+ * Errors are handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler from freeing
+ * metadata of the created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
+
+ /*
+ * btrfs_reloc_clone_csums() error, now we're OK to call error handler,
+ * as metadata for created ordered extent will only be freed by
+ * btrfs_finish_ordered_io().
+ */
+ return ret;
+}
+
+/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
@@ -1962,13 +2122,18 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* blocks on disk
*/
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
- struct page *locked_page,
+ struct folio *locked_folio,
const u64 start, const u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
+ /*
+ * If not 0, represents the inclusive end of the last fallback_to_cow()
+ * range. Only for error handling.
+ */
+ u64 cow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
@@ -1991,17 +2156,14 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
nocow_args.end = end;
nocow_args.writeback_path = true;
- while (1) {
+ while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
- struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
+ struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 ram_bytes;
- u64 nocow_end;
int extent_type;
- bool is_prealloc;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
@@ -2056,12 +2218,13 @@ next_slot:
/*
* If the found extent starts after requested offset, then
- * adjust extent_end to be right before this extent begins
+ * adjust cur_offset to be right before this extent begins.
*/
if (found_key.offset > cur_offset) {
- extent_end = found_key.offset;
- extent_type = 0;
- goto must_cow;
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = found_key.offset;
+ goto next_slot;
}
/*
@@ -2077,7 +2240,6 @@ next_slot:
ret = -EUCLEAN;
goto error;
}
- ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
extent_end = btrfs_file_extent_end(path);
/*
@@ -2097,7 +2259,9 @@ next_slot:
goto must_cow;
ret = 0;
- nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ nocow_bg = btrfs_inc_nocow_writers(fs_info,
+ nocow_args.file_extent.disk_bytenr +
+ nocow_args.file_extent.offset);
if (!nocow_bg) {
must_cow:
/*
@@ -2124,79 +2288,23 @@ must_cow:
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = fallback_to_cow(inode, locked_page,
- cow_start, found_key.offset - 1);
+ ret = fallback_to_cow(inode, locked_folio, cow_start,
+ found_key.offset - 1);
cow_start = (u64)-1;
if (ret) {
+ cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg);
goto error;
}
}
- nocow_end = cur_offset + nocow_args.num_bytes - 1;
- is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
- if (is_prealloc) {
- u64 orig_start = found_key.offset - nocow_args.extent_offset;
- struct extent_map *em;
-
- em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
- orig_start,
- nocow_args.disk_bytenr, /* block_start */
- nocow_args.num_bytes, /* block_len */
- nocow_args.disk_num_bytes, /* orig_block_len */
- ram_bytes, BTRFS_COMPRESS_NONE,
- BTRFS_ORDERED_PREALLOC);
- if (IS_ERR(em)) {
- btrfs_dec_nocow_writers(nocow_bg);
- ret = PTR_ERR(em);
- goto error;
- }
- free_extent_map(em);
- }
-
- ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- nocow_args.num_bytes, nocow_args.num_bytes,
- nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
- is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW),
- BTRFS_COMPRESS_NONE);
+ ret = nocow_one_range(inode, locked_folio, &cached_state,
+ &nocow_args, cur_offset,
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (IS_ERR(ordered)) {
- if (is_prealloc) {
- btrfs_drop_extent_map_range(inode, cur_offset,
- nocow_end, false);
- }
- ret = PTR_ERR(ordered);
+ if (ret < 0)
goto error;
- }
-
- if (btrfs_is_data_reloc_root(root))
- /*
- * Error handled later, as we must prevent
- * extent_clear_unlock_delalloc() in error handler
- * from freeing metadata of created ordered extent.
- */
- ret = btrfs_reloc_clone_csums(ordered);
- btrfs_put_ordered_extent(ordered);
-
- extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
-
cur_offset = extent_end;
-
- /*
- * btrfs_reloc_clone_csums() error, now we're OK to call error
- * handler, as metadata for created ordered extent will only
- * be freed by btrfs_finish_ordered_io().
- */
- if (ret)
- goto error;
- if (cur_offset > end)
- break;
}
btrfs_release_path(path);
@@ -2204,11 +2312,12 @@ must_cow:
cow_start = cur_offset;
if (cow_start != (u64)-1) {
- cur_offset = end;
- ret = fallback_to_cow(inode, locked_page, cow_start, end);
+ ret = fallback_to_cow(inode, locked_folio, cow_start, end);
cow_start = (u64)-1;
- if (ret)
+ if (ret) {
+ cow_end = end;
goto error;
+ }
}
btrfs_free_path(path);
@@ -2216,19 +2325,59 @@ must_cow:
error:
/*
+ * There are several error cases:
+ *
+ * 1) Failed without falling back to COW
+ * start cur_offset end
+ * |/////////////| |
+ *
+ * For range [start, cur_offset) the folios are already unlocked (except
+ * @locked_folio), EXTENT_DELALLOC already removed.
+ * Only need to clear the dirty flag as they will never be submitted.
+ * Ordered extent and extent maps are handled by
+ * btrfs_mark_ordered_io_finished() inside run_delalloc_range().
+ *
+ * 2) Failed with error from fallback_to_cow()
+ * start cur_offset cow_end end
+ * |/////////////|-----------| |
+ *
+ * For range [start, cur_offset) it's the same as case 1).
+ * But for range [cur_offset, cow_end), the folios have dirty flag
+ * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range().
+ *
+ * Thus we should not call extent_clear_unlock_delalloc() on range
+ * [cur_offset, cow_end), as the folios are already unlocked.
+ *
+ * So clear the folio dirty flags for [start, cur_offset) first.
+ */
+ if (cur_offset > start)
+ cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+
+ /*
* If an error happened while a COW region is outstanding, cur_offset
- * needs to be reset to cow_start to ensure the COW region is unlocked
- * as well.
+ * needs to be reset to @cow_end + 1 to skip the COW range, as
+ * cow_file_range() will do the proper cleanup at error.
+ */
+ if (cow_end)
+ cur_offset = cow_end + 1;
+
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
*/
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- if (cur_offset < end)
+ if (cur_offset < end) {
+ struct extent_state *cached = NULL;
+
+ lock_extent(&inode->io_tree, cur_offset, end, &cached);
extent_clear_unlock_delalloc(inode, cur_offset, end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_DEFRAG |
+ locked_folio, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
+ btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
+ }
btrfs_free_path(path);
return ret;
}
@@ -2237,8 +2386,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
- test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
- 0, NULL))
+ test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
}
@@ -2249,40 +2397,39 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc)
{
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
int ret;
/*
- * The range must cover part of the @locked_page, or a return of 1
+ * The range must cover part of the @locked_folio, or a return of 1
* can confuse the caller.
*/
- ASSERT(!(end <= page_offset(locked_page) ||
- start >= page_offset(locked_page) + PAGE_SIZE));
+ ASSERT(!(end <= folio_pos(locked_folio) ||
+ start >= folio_pos(locked_folio) + folio_size(locked_folio)));
if (should_nocow(inode, start, end)) {
- ret = run_delalloc_nocow(inode, locked_page, start, end);
+ ret = run_delalloc_nocow(inode, locked_folio, start, end);
goto out;
}
if (btrfs_inode_can_compress(inode) &&
inode_need_compress(inode, start, end) &&
- run_delalloc_compressed(inode, locked_page, start, end, wbc))
+ run_delalloc_compressed(inode, locked_folio, start, end, wbc))
return 1;
if (zoned)
- ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
+ ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
true);
else
- ret = cow_file_range(inode, locked_page, start, end, NULL,
+ ret = cow_file_range(inode, locked_folio, start, end, NULL,
false, false);
out:
if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, locked_page, start,
- end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1);
return ret;
}
@@ -2292,6 +2439,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 size;
+ lockdep_assert_held(&inode->io_tree.lock);
+
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
@@ -2330,6 +2479,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
u64 new_size, old_size;
u32 num_extents;
+ lockdep_assert_held(&inode->io_tree.lock);
+
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
@@ -2377,55 +2528,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
spin_unlock(&inode->lock);
}
-static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
- struct btrfs_inode *inode)
+static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&root->delalloc_lock);
- if (list_empty(&inode->delalloc_inodes)) {
- list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
- root->nr_delalloc_inodes++;
- if (root->nr_delalloc_inodes == 1) {
- spin_lock(&fs_info->delalloc_root_lock);
- BUG_ON(!list_empty(&root->delalloc_root));
- list_add_tail(&root->delalloc_root,
- &fs_info->delalloc_roots);
- spin_unlock(&fs_info->delalloc_root_lock);
- }
+ ASSERT(list_empty(&inode->delalloc_inodes));
+ list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&fs_info->delalloc_root_lock);
+ ASSERT(list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
spin_unlock(&root->delalloc_lock);
}
-void __btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
+void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ lockdep_assert_held(&root->delalloc_lock);
+
+ /*
+ * We may be called after the inode was already deleted from the list,
+ * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
+ * and then later through btrfs_clear_delalloc_extent() while the inode
+ * still has ->delalloc_bytes > 0.
+ */
if (!list_empty(&inode->delalloc_inodes)) {
list_del_init(&inode->delalloc_inodes);
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags);
root->nr_delalloc_inodes--;
if (!root->nr_delalloc_inodes) {
ASSERT(list_empty(&root->delalloc_inodes));
spin_lock(&fs_info->delalloc_root_lock);
- BUG_ON(list_empty(&root->delalloc_root));
+ ASSERT(!list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
spin_unlock(&fs_info->delalloc_root_lock);
}
}
}
-static void btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
-{
- spin_lock(&root->delalloc_lock);
- __btrfs_del_delalloc_inode(root, inode);
- spin_unlock(&root->delalloc_lock);
-}
-
/*
* Properly track delayed allocation bytes in the inode and to maintain the
* list of inodes that have pending delalloc work to be done.
@@ -2435,6 +2581,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ lockdep_assert_held(&inode->io_tree.lock);
+
if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
@@ -2443,10 +2591,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
- struct btrfs_root *root = inode->root;
u64 len = state->end + 1 - state->start;
+ u64 prev_delalloc_bytes;
u32 num_extents = count_max_extents(fs_info, len);
- bool do_list = !btrfs_is_free_space_inode(inode);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, num_extents);
@@ -2459,13 +2606,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
+ prev_delalloc_bytes = inode->delalloc_bytes;
inode->delalloc_bytes += len;
if (bits & EXTENT_DEFRAG)
inode->defrag_bytes += len;
- if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags))
- btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&inode->lock);
+
+ /*
+ * We don't need to be under the protection of the inode's lock,
+ * because we are called while holding the inode's io_tree lock
+ * and are therefore protected against concurrent calls of this
+ * function and btrfs_clear_delalloc_extent().
+ */
+ if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
+ btrfs_add_delalloc_inode(inode);
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
@@ -2487,6 +2641,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(fs_info, len);
+ lockdep_assert_held(&inode->io_tree.lock);
+
if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
@@ -2500,7 +2656,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
*/
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
- bool do_list = !btrfs_is_free_space_inode(inode);
+ u64 new_delalloc_bytes;
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, -num_extents);
@@ -2520,7 +2676,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
return;
if (!btrfs_is_data_reloc_root(root) &&
- do_list && !(state->state & EXTENT_NORESERVE) &&
+ !btrfs_is_free_space_inode(inode) &&
+ !(state->state & EXTENT_NORESERVE) &&
(bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2528,11 +2685,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
inode->delalloc_bytes -= len;
- if (do_list && inode->delalloc_bytes == 0 &&
- test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags))
- btrfs_del_delalloc_inode(root, inode);
+ new_delalloc_bytes = inode->delalloc_bytes;
spin_unlock(&inode->lock);
+
+ /*
+ * We don't need to be under the protection of the inode's lock,
+ * because we are called while holding the inode's io_tree lock
+ * and are therefore protected against concurrent calls of this
+ * function and btrfs_set_delalloc_extent().
+ */
+ if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
+ spin_lock(&root->delalloc_lock);
+ btrfs_del_delalloc_inode(inode);
+ spin_unlock(&root->delalloc_lock);
+ }
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
@@ -2546,44 +2712,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
}
}
-static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
- struct btrfs_ordered_extent *ordered)
-{
- u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
- u64 len = bbio->bio.bi_iter.bi_size;
- struct btrfs_ordered_extent *new;
- int ret;
-
- /* Must always be called for the beginning of an ordered extent. */
- if (WARN_ON_ONCE(start != ordered->disk_bytenr))
- return -EINVAL;
-
- /* No need to split if the ordered extent covers the entire bio. */
- if (ordered->disk_num_bytes == len) {
- refcount_inc(&ordered->refs);
- bbio->ordered = ordered;
- return 0;
- }
-
- /*
- * Don't split the extent_map for NOCOW extents, as we're writing into
- * a pre-existing one.
- */
- if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = split_extent_map(bbio->inode, bbio->file_offset,
- ordered->num_bytes, len,
- ordered->disk_bytenr);
- if (ret)
- return ret;
- }
-
- new = btrfs_split_ordered_extent(ordered, len);
- if (IS_ERR(new))
- return PTR_ERR(new);
- bbio->ordered = new;
- return 0;
-}
-
/*
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
@@ -2622,11 +2750,11 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
u64 em_len;
int ret = 0;
- em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ em = btrfs_get_extent(inode, NULL, search_start, search_len);
if (IS_ERR(em))
return PTR_ERR(em);
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->disk_bytenr != EXTENT_MAP_HOLE)
goto next;
em_len = em->len;
@@ -2676,7 +2804,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
- struct page *page;
+ struct folio *folio;
struct btrfs_inode *inode;
struct btrfs_work work;
};
@@ -2688,50 +2816,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- struct page *page = fixup->page;
+ struct folio *folio = fixup->folio;
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 page_start = page_offset(page);
- u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
/*
* This is similar to page_mkwrite, we need to reserve the space before
- * we take the page lock.
+ * we take the folio lock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- PAGE_SIZE);
+ folio_size(folio));
again:
- lock_page(page);
+ folio_lock(folio);
/*
- * Before we queued this fixup, we took a reference on the page.
- * page->mapping may go NULL, but it shouldn't be moved to a different
+ * Before we queued this fixup, we took a reference on the folio.
+ * folio->mapping may go NULL, but it shouldn't be moved to a different
* address space.
*/
- if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+ if (!folio->mapping || !folio_test_dirty(folio) ||
+ !folio_test_checked(folio)) {
/*
* Unfortunately this is a little tricky, either
*
- * 1) We got here and our page had already been dealt with and
+ * 1) We got here and our folio had already been dealt with and
* we reserved our space, thus ret == 0, so we need to just
* drop our space reservation and bail. This can happen the
* first time we come into the fixup worker, or could happen
* while waiting for the ordered extent.
- * 2) Our page was already dealt with, but we happened to get an
+ * 2) Our folio was already dealt with, but we happened to get an
* ENOSPC above from the btrfs_delalloc_reserve_space. In
* this case we obviously don't have anything to release, but
- * because the page was already dealt with we don't want to
- * mark the page with an error, so make sure we're resetting
+ * because the folio was already dealt with we don't want to
+ * mark the folio with an error, so make sure we're resetting
* ret to 0. This is why we have this check _before_ the ret
* check, because we do not want to have a surprise ENOSPC
- * when the page was already properly dealt with.
+ * when the folio was already properly dealt with.
*/
if (!ret) {
- btrfs_delalloc_release_extents(inode, PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, folio_size(folio));
btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE,
+ page_start, folio_size(folio),
true);
}
ret = 0;
@@ -2739,7 +2868,7 @@ again:
}
/*
- * We can't mess with the page state unless it is locked, so now that
+ * We can't mess with the folio state unless it is locked, so now that
* it is locked bail if we failed to make our space reservation.
*/
if (ret)
@@ -2748,14 +2877,14 @@ again:
lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
- if (PageOrdered(page))
+ if (folio_test_ordered(folio))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
unlock_extent(&inode->io_tree, page_start, page_end,
&cached_state);
- unlock_page(page);
+ folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -2773,7 +2902,7 @@ again:
*
* The page was dirty when we started, nothing should have cleaned it.
*/
- BUG_ON(!PageDirty(page));
+ BUG_ON(!folio_test_dirty(folio));
free_delalloc_space = false;
out_reserved:
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
@@ -2787,14 +2916,14 @@ out_page:
* We hit ENOSPC or other errors. Update the mapping and page
* to reflect the errors and clean the page.
*/
- mapping_set_error(page->mapping, ret);
- btrfs_mark_ordered_io_finished(inode, page, page_start,
- PAGE_SIZE, !ret);
- clear_page_dirty_for_io(page);
- }
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
- unlock_page(page);
- put_page(page);
+ mapping_set_error(folio->mapping, ret);
+ btrfs_mark_ordered_io_finished(inode, folio, page_start,
+ folio_size(folio), !ret);
+ folio_clear_dirty_for_io(folio);
+ }
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ folio_unlock(folio);
+ folio_put(folio);
kfree(fixup);
extent_changeset_free(data_reserved);
/*
@@ -2807,33 +2936,34 @@ out_page:
/*
* There are a few paths in the higher layers of the kernel that directly
- * set the page dirty bit without asking the filesystem if it is a
+ * set the folio dirty bit without asking the filesystem if it is a
* good idea. This causes problems because we want to make sure COW
* properly happens and the data=ordered rules are followed.
*
* In our case any range that doesn't have the ORDERED bit set
* hasn't been properly setup for IO. We kick off an async process
* to fix it up. The async helper will wait for ordered extents, set
- * the delalloc bit and make it safe to write the page.
+ * the delalloc bit and make it safe to write the folio.
*/
-int btrfs_writepage_cow_fixup(struct page *page)
+int btrfs_writepage_cow_fixup(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct inode *inode = folio->mapping->host;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_writepage_fixup *fixup;
- /* This page has ordered extent covering it already */
- if (PageOrdered(page))
+ /* This folio has ordered extent covering it already */
+ if (folio_test_ordered(folio))
return 0;
/*
- * PageChecked is set below when we create a fixup worker for this page,
- * don't try to create another one if we're already PageChecked()
+ * folio_checked is set below when we create a fixup worker for this
+ * folio, don't try to create another one if we're already
+ * folio_test_checked.
*
- * The extent_io writepage code will redirty the page if we send back
+ * The extent_io writepage code will redirty the foio if we send back
* EAGAIN.
*/
- if (PageChecked(page))
+ if (folio_test_checked(folio))
return -EAGAIN;
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
@@ -2843,14 +2973,14 @@ int btrfs_writepage_cow_fixup(struct page *page)
/*
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
- * takes place outside of the page lock, and we can't trust
- * page->mapping outside of the page lock.
+ * takes place outside of the folio lock, and we can't trust
+ * page->mapping outside of the folio lock.
*/
ihold(inode);
- btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
- get_page(page);
- btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
- fixup->page = page;
+ btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
+ folio_get(folio);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
+ fixup->folio = folio;
fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
@@ -2982,10 +3112,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
num_bytes = oe->truncated_len;
- ram_bytes = num_bytes;
- }
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
@@ -3001,7 +3129,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
- return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ return insert_reserved_file_extent(trans, oe->inode,
oe->file_offset, &stack_fi,
update_inode_bytes, oe->qgroup_rsv);
}
@@ -3013,7 +3141,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
*/
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{
- struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+ struct btrfs_inode *inode = ordered_extent->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans = NULL;
@@ -3072,9 +3200,14 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
trans->block_rsv = &inode->block_rsv;
- ret = btrfs_update_inode_fallback(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
+
+ ret = btrfs_insert_raid_extent(trans, ordered_extent);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
goto out;
}
@@ -3093,6 +3226,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
+ ret = btrfs_insert_raid_extent(trans, ordered_extent);
+ if (ret)
+ goto out;
+
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@@ -3113,8 +3250,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
- ordered_extent->num_bytes, trans->transid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = unpin_extent_cache(inode, ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3138,12 +3280,11 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
&cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
- ret = btrfs_update_inode_fallback(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = 0;
out:
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
&cached_state);
@@ -3162,9 +3303,8 @@ out:
* set the mapping error, so we need to set it if we're the ones
* marking this ordered extent as failed.
*/
- if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
- &ordered_extent->flags))
- mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+ if (ret)
+ btrfs_mark_ordered_extent_error(ordered_extent);
if (truncated)
unwritten_start += logical_len;
@@ -3218,7 +3358,7 @@ out:
* Actually free the qgroup rsv which was released when
* the ordered extent was created.
*/
- btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
ordered_extent->qgroup_rsv,
BTRFS_QGROUP_RSV_DATA);
}
@@ -3240,8 +3380,9 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
- if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
- !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+ list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
return btrfs_finish_one_ordered(ordered);
}
@@ -3299,7 +3440,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
if (btrfs_is_data_reloc_root(inode->root) &&
test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
- 1, NULL)) {
+ NULL)) {
/* Skip the range without csum for data reloc inode */
clear_extent_bits(&inode->io_tree, file_offset, end,
EXTENT_NODATASUM);
@@ -3323,7 +3464,7 @@ zeroit:
}
/*
- * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ * Perform a delayed iput on @inode.
*
* @inode: The inode we want to perform iput on
*
@@ -3533,7 +3674,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, last_objectid, root);
+ inode = btrfs_iget(last_objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL;
@@ -3718,13 +3859,37 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
return 1;
}
+static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (WARN_ON_ONCE(inode->file_extent_tree))
+ return 0;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
+ return 0;
+ if (!S_ISREG(inode->vfs_inode.i_mode))
+ return 0;
+ if (btrfs_is_free_space_inode(inode))
+ return 0;
+
+ inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!inode->file_extent_tree)
+ return -ENOMEM;
+
+ extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
+
+ return 0;
+}
+
/*
* read an inode from the btree into the in-memory inode
*/
static int btrfs_read_locked_inode(struct inode *inode,
struct btrfs_path *in_path)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
@@ -3737,6 +3902,10 @@ static int btrfs_read_locked_inode(struct inode *inode,
bool filled = false;
int first_xattr_slot;
+ ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ if (ret)
+ return ret;
+
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3747,7 +3916,7 @@ static int btrfs_read_locked_inode(struct inode *inode,
return -ENOMEM;
}
- memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+ btrfs_get_inode_key(BTRFS_I(inode), &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
@@ -3771,19 +3940,17 @@ static int btrfs_read_locked_inode(struct inode *inode,
btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
round_up(i_size_read(inode), fs_info->sectorsize));
- inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
- inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+ inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
- inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
+ inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ btrfs_timespec_nsec(leaf, &inode_item->mtime));
inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime.tv_sec =
- btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime.tv_nsec =
- btrfs_timespec_nsec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@ -3795,7 +3962,9 @@ static int btrfs_read_locked_inode(struct inode *inode,
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
@@ -3807,9 +3976,9 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
- if (BTRFS_I(inode)->last_trans == fs_info->generation)
+ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -3886,7 +4055,7 @@ cache_acl:
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
btrfs_ino(BTRFS_I(inode)),
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
}
if (path != in_path)
btrfs_free_path(path);
@@ -3939,24 +4108,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->otime,
- BTRFS_I(inode)->i_otime.tv_sec);
- btrfs_set_token_timespec_nsec(&token, &item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec);
+ btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
btrfs_set_token_inode_generation(&token, item,
@@ -3974,19 +4141,20 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* copy everything in the in-memory inode into the btree.
*/
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
+ struct btrfs_key key;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
+ btrfs_get_inode_key(inode, &key);
+ ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -4009,10 +4177,10 @@ failed:
/*
* copy everything in the in-memory inode into the btree.
*/
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode)
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -4028,23 +4196,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
- ret = btrfs_delayed_update_inode(trans, root, inode);
+ ret = btrfs_delayed_update_inode(trans, inode);
if (!ret)
btrfs_set_inode_last_trans(trans, inode);
return ret;
}
- return btrfs_update_inode_item(trans, root, inode);
+ return btrfs_update_inode_item(trans, inode);
}
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
int ret;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret == -ENOSPC)
- return btrfs_update_inode_item(trans, root, inode);
+ return btrfs_update_inode_item(trans, inode);
return ret;
}
@@ -4150,9 +4318,8 @@ err:
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_ctime_current(&inode->vfs_inode);
- dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
- ret = btrfs_update_inode(trans, root, dir);
+ inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ ret = btrfs_update_inode(trans, dir);
out:
return ret;
}
@@ -4166,7 +4333,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, inode->root, inode);
+ ret = btrfs_update_inode(trans, inode);
}
return ret;
}
@@ -4250,9 +4417,9 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
/* This needs to handle no-key deletions later on */
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
- objectid = inode->root->root_key.objectid;
+ objectid = btrfs_root_id(inode->root);
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
- objectid = inode->location.objectid;
+ objectid = inode->ref_root_id;
} else {
WARN_ON(1);
fscrypt_free_filename(&fname);
@@ -4305,7 +4472,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
} else {
ret = btrfs_del_root_ref(trans, objectid,
- root->root_key.objectid, dir_ino,
+ btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -4321,8 +4488,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
inode_inc_iversion(&dir->vfs_inode);
- dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
- ret = btrfs_update_inode_fallback(trans, root, dir);
+ inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ ret = btrfs_update_inode_fallback(trans, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -4355,7 +4522,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
dir_id, &name, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.objectid == root->root_key.objectid) {
+ if (key.objectid == btrfs_root_id(root)) {
ret = -EPERM;
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
@@ -4365,7 +4532,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
btrfs_release_path(path);
}
- key.objectid = root->root_key.objectid;
+ key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
@@ -4385,8 +4552,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
if (path->slots[0] > 0) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == root->root_key.objectid &&
- key.type == BTRFS_ROOT_REF_KEY)
+ if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
out:
@@ -4398,70 +4564,32 @@ out:
static void btrfs_prune_dentries(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
- u64 objectid = 0;
+ struct btrfs_inode *inode;
+ u64 min_ino = 0;
if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
+ inode = btrfs_find_first_inode(root, min_ino);
+ while (inode) {
+ if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ d_prune_aliases(&inode->vfs_inode);
- if (objectid < btrfs_ino(entry))
- node = node->rb_left;
- else if (objectid > btrfs_ino(entry))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(entry)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = btrfs_ino(entry) + 1;
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- if (atomic_read(&inode->i_count) > 1)
- d_prune_aliases(inode);
- /*
- * btrfs_drop_inode will have it removed from the inode
- * cache when its usage count hits zero.
- */
- iput(inode);
- cond_resched();
- spin_lock(&root->inode_lock);
- goto again;
- }
-
- if (cond_resched_lock(&root->inode_lock))
- goto again;
-
- node = rb_next(node);
+ min_ino = btrfs_ino(inode) + 1;
+ /*
+ * btrfs_drop_inode() will have it removed from the inode
+ * cache when its usage count hits zero.
+ */
+ iput(&inode->vfs_inode);
+ cond_resched();
+ inode = btrfs_find_first_inode(root, min_ino);
}
- spin_unlock(&root->inode_lock);
}
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode = d_inode(dentry);
struct btrfs_root *dest = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -4482,7 +4610,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
- dest->root_key.objectid);
+ btrfs_root_id(dest));
ret = -EPERM;
goto out_up_write;
}
@@ -4490,7 +4618,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EPERM;
goto out_up_write;
}
@@ -4519,11 +4647,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = PTR_ERR(trans);
goto out_release;
}
- ret = btrfs_record_root_in_trans(trans, root);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_end_trans;
- }
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
@@ -4551,7 +4674,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4559,8 +4682,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
}
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- dest->root_key.objectid);
+ BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4569,7 +4691,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans,
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4610,9 +4732,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- int err = 0;
+ int ret = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
@@ -4626,59 +4747,61 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
}
- err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
- if (err)
- return err;
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
/* This needs to handle no-key deletions later on */
trans = __unlink_start_trans(BTRFS_I(dir));
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_notrans;
}
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
}
- err = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
- if (!err) {
+ if (!ret)
btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
- }
out:
btrfs_end_transaction(trans);
out_notrans:
btrfs_btree_balance_dirty(fs_info);
fscrypt_free_filename(&fname);
- return err;
+ return ret;
}
/*
- * btrfs_truncate_block - read, zero a chunk and write a block
+ * Read, zero a chunk and write a block.
+ *
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
* @len - the length to zero, 0 to zero the entire range respective to the
@@ -4701,7 +4824,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
- struct page *page;
+ struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
size_t write_bytes = blocksize;
int ret = 0;
@@ -4733,24 +4856,28 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
again:
- page = find_or_create_page(mapping, index, mask);
- if (!page) {
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio)) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
ret = -ENOMEM;
goto out;
}
- if (!PageUptodate(page)) {
- ret = btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ ret = btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto out_unlock;
}
@@ -4759,22 +4886,22 @@ again:
/*
* We unlock the page after the io is completed and then re-lock it
* above. release_folio() could have come in between that and cleared
- * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * folio private, but left the page in the mapping. Set the page mapped
* here to make sure it's properly set for the subpage stuff.
*/
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto out_unlock;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent(io_tree, block_start, block_end, &cached_state);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -4795,15 +4922,17 @@ again:
if (!len)
len = blocksize - offset;
if (front)
- memzero_page(page, (block_start - page_offset(page)),
- offset);
+ folio_zero_range(folio, block_start - folio_pos(folio),
+ offset);
else
- memzero_page(page, (block_start - page_offset(page)) + offset,
- len);
- }
- btrfs_page_clear_checked(fs_info, page, block_start,
- block_end + 1 - block_start);
- btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ folio_zero_range(folio,
+ (block_start - folio_pos(folio)) + offset,
+ len);
+ }
+ btrfs_folio_clear_checked(fs_info, folio, block_start,
+ block_end + 1 - block_start);
+ btrfs_folio_set_dirty(fs_info, folio, block_start,
+ block_end + 1 - block_start);
unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -4819,8 +4948,8 @@ out_unlock:
block_start, blocksize, true);
}
btrfs_delalloc_release_extents(inode, blocksize);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out:
if (only_release_metadata)
btrfs_check_nocow_unlock(inode);
@@ -4828,9 +4957,9 @@ out:
return ret;
}
-static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
- u64 offset, u64 len)
+static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_drop_extents_args drop_args = { 0 };
@@ -4870,7 +4999,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
btrfs_abort_transaction(trans, ret);
} else {
btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, inode);
}
btrfs_end_transaction(trans);
return ret;
@@ -4894,16 +5023,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err = 0;
+ int ret = 0;
/*
* If our size started in the middle of a block we need to zero out the
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- err = btrfs_truncate_block(inode, oldsize, 0, 0);
- if (err)
- return err;
+ ret = btrfs_truncate_block(inode, oldsize, 0, 0);
+ if (ret)
+ return ret;
if (size <= hole_start)
return 0;
@@ -4912,10 +5041,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
&cached_state);
cur_offset = hole_start;
while (1) {
- em = btrfs_get_extent(inode, NULL, 0, cur_offset,
- block_end - cur_offset);
+ em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
if (IS_ERR(em)) {
- err = PTR_ERR(em);
+ ret = PTR_ERR(em);
em = NULL;
break;
}
@@ -4923,17 +5051,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
- err = maybe_insert_hole(root, inode, cur_offset,
- hole_size);
- if (err)
+ ret = maybe_insert_hole(inode, cur_offset, hole_size);
+ if (ret)
break;
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
hole_em = alloc_extent_map();
@@ -4946,21 +5073,18 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
}
hole_em->start = cur_offset;
hole_em->len = hole_size;
- hole_em->orig_start = cur_offset;
- hole_em->block_start = EXTENT_MAP_HOLE;
- hole_em->block_len = 0;
- hole_em->orig_block_len = 0;
+ hole_em->disk_bytenr = EXTENT_MAP_HOLE;
+ hole_em->disk_num_bytes = 0;
hole_em->ram_bytes = hole_size;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
- hole_em->generation = fs_info->generation;
+ hole_em->generation = btrfs_get_fs_generation(fs_info);
- err = btrfs_replace_extent_map_range(inode, hole_em, true);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
} else {
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
}
next:
@@ -4972,7 +5096,7 @@ next:
}
free_extent_map(em);
unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
- return err;
+ return ret;
}
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
@@ -4993,7 +5117,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (newsize != oldsize) {
inode_inc_iversion(inode);
if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
}
}
@@ -5021,14 +5146,14 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
i_size_write(inode, newsize);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
pagecache_isize_extended(inode, oldsize, newsize);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
if (btrfs_is_zoned(fs_info)) {
- ret = btrfs_wait_ordered_range(inode,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode),
ALIGN(newsize, fs_info->sectorsize),
(u64)-1);
if (ret)
@@ -5058,7 +5183,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
- err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (err)
return err;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
@@ -5228,7 +5353,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
void btrfs_evict_inode(struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv = NULL;
@@ -5242,11 +5367,12 @@ void btrfs_evict_inode(struct inode *inode)
return;
}
+ fs_info = inode_to_fs_info(inode);
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
((btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
btrfs_is_free_space_inode(BTRFS_I(inode))))
goto out;
@@ -5258,7 +5384,7 @@ void btrfs_evict_inode(struct inode *inode)
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
goto out;
}
@@ -5430,7 +5556,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
}
err = -ENOENT;
- key.objectid = dir->root->root_key.objectid;
+ key.objectid = btrfs_root_id(dir->root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
@@ -5471,59 +5597,52 @@ out:
return err;
}
-static void inode_tree_add(struct btrfs_inode *inode)
+static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
{
struct btrfs_root *root = inode->root;
- struct btrfs_inode *entry;
- struct rb_node **p;
- struct rb_node *parent;
- struct rb_node *new = &inode->rb_node;
- u64 ino = btrfs_ino(inode);
+ struct btrfs_inode *existing;
+ const u64 ino = btrfs_ino(inode);
+ int ret;
if (inode_unhashed(&inode->vfs_inode))
- return;
- parent = NULL;
- spin_lock(&root->inode_lock);
- p = &root->inode_tree.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct btrfs_inode, rb_node);
+ return 0;
- if (ino < btrfs_ino(entry))
- p = &parent->rb_left;
- else if (ino > btrfs_ino(entry))
- p = &parent->rb_right;
- else {
- WARN_ON(!(entry->vfs_inode.i_state &
- (I_WILL_FREE | I_FREEING)));
- rb_replace_node(parent, new, &root->inode_tree);
- RB_CLEAR_NODE(parent);
- spin_unlock(&root->inode_lock);
- return;
- }
+ if (prealloc) {
+ ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
+
+ if (xa_is_err(existing)) {
+ ret = xa_err(existing);
+ ASSERT(ret != -EINVAL);
+ ASSERT(ret != -ENOMEM);
+ return ret;
+ } else if (existing) {
+ WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
}
- rb_link_node(new, parent, p);
- rb_insert_color(new, &root->inode_tree);
- spin_unlock(&root->inode_lock);
+
+ return 0;
}
-static void inode_tree_del(struct btrfs_inode *inode)
+static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
- int empty = 0;
+ struct btrfs_inode *entry;
+ bool empty = false;
- spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&inode->rb_node)) {
- rb_erase(&inode->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&inode->rb_node);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- }
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->inodes);
+ entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ if (entry == inode)
+ empty = xa_empty(&root->inodes);
+ xa_unlock(&root->inodes);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- spin_lock(&root->inode_lock);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->inodes);
+ empty = xa_empty(&root->inodes);
+ xa_unlock(&root->inodes);
if (empty)
btrfs_add_dead_root(root);
}
@@ -5534,12 +5653,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->ino;
- BTRFS_I(inode)->location.objectid = args->ino;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
+ btrfs_set_inode_number(BTRFS_I(inode), args->ino);
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
- BUG_ON(args->root && !BTRFS_I(inode)->root);
if (args->root && args->root == args->root->fs_info->tree_root &&
args->ino != BTRFS_BTREE_INODE_OBJECTID)
@@ -5552,12 +5667,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->ino == BTRFS_I(inode)->location.objectid &&
+ return args->ino == btrfs_ino(BTRFS_I(inode)) &&
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
- struct btrfs_root *root)
+static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
@@ -5566,7 +5680,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
- inode = iget5_locked(s, hashval, btrfs_find_actor,
+ inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
@@ -5578,57 +5692,62 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
* allocator. NULL is also valid but may require an additional allocation
* later.
*/
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path)
+struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path)
{
struct inode *inode;
+ int ret;
- inode = btrfs_iget_locked(s, ino, root);
+ inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- int ret;
+ if (!(inode->i_state & I_NEW))
+ return inode;
- ret = btrfs_read_locked_inode(inode, path);
- if (!ret) {
- inode_tree_add(BTRFS_I(inode));
- unlock_new_inode(inode);
- } else {
- iget_failed(inode);
- /*
- * ret > 0 can come from btrfs_search_slot called by
- * btrfs_read_locked_inode, this means the inode item
- * was not found.
- */
- if (ret > 0)
- ret = -ENOENT;
- inode = ERR_PTR(ret);
- }
- }
+ ret = btrfs_read_locked_inode(inode, path);
+ /*
+ * ret > 0 can come from btrfs_search_slot called by
+ * btrfs_read_locked_inode(), this means the inode item was not found.
+ */
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto error;
+
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ if (ret < 0)
+ goto error;
+
+ unlock_new_inode(inode);
return inode;
+error:
+ iget_failed(inode);
+ return ERR_PTR(ret);
}
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
+struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(s, ino, root, NULL);
+ return btrfs_iget_path(ino, root, NULL);
}
static struct inode *new_simple_dir(struct inode *dir,
struct btrfs_key *key,
struct btrfs_root *root)
{
+ struct timespec64 ts;
struct inode *inode = new_inode(dir->i_sb);
if (!inode)
return ERR_PTR(-ENOMEM);
BTRFS_I(inode)->root = btrfs_grab_root(root);
- memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ BTRFS_I(inode)->ref_root_id = key->objectid;
+ set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
@@ -5637,9 +5756,13 @@ static struct inode *new_simple_dir(struct inode *dir,
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = inode_set_ctime_current(inode);
- inode->i_atime = dir->i_atime;
- BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+ ts = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_atime_to_ts(inode, inode_get_atime(dir));
+ BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+ BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+
inode->i_uid = dir->i_uid;
inode->i_gid = dir->i_gid;
@@ -5662,7 +5785,7 @@ static inline u8 btrfs_inode_type(struct inode *inode)
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
@@ -5678,7 +5801,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, location.objectid, root);
+ inode = btrfs_iget(location.objectid, root);
if (IS_ERR(inode))
return inode;
@@ -5702,7 +5825,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir, &location, root);
} else {
- inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
+ inode = btrfs_iget(location.objectid, sub_root);
btrfs_put_root(sub_root);
if (IS_ERR(inode))
@@ -5922,7 +6045,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
addr = private->filldir_buf;
path->reada = READA_FORWARD;
- put = btrfs_readdir_get_delayed_items(inode, private->last_index,
+ put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
&ins_list, &del_list);
again:
@@ -6012,7 +6135,7 @@ nopos:
ret = 0;
err:
if (put)
- btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
+ btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
btrfs_free_path(path);
return ret;
}
@@ -6037,15 +6160,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
+ ret = btrfs_update_inode(trans, inode);
+ if (ret == -ENOSPC || ret == -EDQUOT) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
}
btrfs_end_transaction(trans);
if (inode->delayed_node)
@@ -6061,7 +6184,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
static int btrfs_update_time(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- bool dirty = flags & ~S_VERSION;
+ bool dirty;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -6097,7 +6220,7 @@ static int btrfs_insert_inode_locked(struct inode *inode)
{
struct btrfs_iget_args args;
- args.ino = BTRFS_I(inode)->location.objectid;
+ args.ino = btrfs_ino(BTRFS_I(inode));
args.root = BTRFS_I(inode)->root;
return insert_inode_locked4(inode,
@@ -6197,13 +6320,13 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
{
+ struct timespec64 ts;
struct inode *dir = args->dir;
struct inode *inode = args->inode;
const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
- struct btrfs_key *location;
struct btrfs_path *path;
u64 objectid;
struct btrfs_inode_ref *ref;
@@ -6212,6 +6335,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_item_batch batch;
unsigned long ptr;
int ret;
+ bool xa_reserved = false;
path = btrfs_alloc_path();
if (!path)
@@ -6221,10 +6345,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
root = BTRFS_I(inode)->root;
+ ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ if (ret)
+ goto out;
+
ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
goto out;
- inode->i_ino = objectid;
+ btrfs_set_inode_number(BTRFS_I(inode), objectid);
+
+ ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
+ if (ret)
+ goto out;
+ xa_reserved = true;
if (args->orphan) {
/*
@@ -6239,12 +6372,21 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- /* index_cnt is ignored for everything but a dir. */
- BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * We don't have any capability xattrs set here yet, shortcut any
+ * queries for the xattrs here. If we add them later via the inode
+ * security init path or any other path this flag will be cleared.
+ */
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ /*
* Subvolumes don't inherit flags from their parent directory.
* Originally this was probably by accident, but we probably can't
* change it now without compatibility issues.
@@ -6260,11 +6402,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_INODE_NODATASUM;
}
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- location->type = BTRFS_INODE_ITEM_KEY;
-
ret = btrfs_insert_inode_locked(inode);
if (ret < 0) {
if (!args->orphan)
@@ -6314,9 +6451,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
goto discard;
}
- inode->i_mtime = inode_set_ctime_current(inode);
- inode->i_atime = inode->i_mtime;
- BTRFS_I(inode)->i_otime = inode->i_mtime;
+ ts = simple_inode_init_ts(inode);
+ BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+ BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
/*
* We're going to fill the inode item now, so at this point the inode
@@ -6363,8 +6500,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
* Subvolumes inherit properties from their parent subvolume,
* not the directory they were created in.
*/
- parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
- BTRFS_I(dir)->root);
+ parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
@@ -6377,8 +6513,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_err(fs_info,
"error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
- ret);
+ btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
}
/*
@@ -6393,7 +6528,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- inode_tree_add(BTRFS_I(inode));
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
+ if (WARN_ON(ret)) {
+ /* Shouldn't happen, we used xa_reserve() before. */
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
@@ -6421,6 +6561,9 @@ discard:
ihold(inode);
discard_new_inode(inode);
out:
+ if (xa_reserved)
+ xa_release(&root->inodes, objectid);
+
btrfs_free_path(path);
return ret;
}
@@ -6451,7 +6594,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
index, name);
} else if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root, name,
@@ -6481,10 +6624,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
* values (the ones it had when the fsync was done).
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- parent_inode->vfs_inode.i_mtime =
- inode_set_ctime_current(&parent_inode->vfs_inode);
+ inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+ inode_set_ctime_current(&parent_inode->vfs_inode));
- ret = btrfs_update_inode(trans, root, parent_inode);
+ ret = btrfs_update_inode(trans, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
return ret;
@@ -6494,7 +6637,7 @@ fail_dir_item:
u64 local_index;
int err;
err = btrfs_del_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
&local_index, name);
if (err)
btrfs_abort_transaction(trans, err);
@@ -6515,7 +6658,7 @@ fail_dir_item:
static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_new_inode_args new_inode_args = {
.dir = dir,
@@ -6585,14 +6728,14 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct fscrypt_name fname;
u64 index;
int err;
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
+ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6635,7 +6778,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
} else {
struct dentry *parent = dentry->d_parent;
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ err = btrfs_update_inode(trans, BTRFS_I(inode));
if (err)
goto fail;
if (inode->i_nlink == 1) {
@@ -6678,7 +6821,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
}
static noinline int uncompress_inline(struct btrfs_path *path,
- struct page *page,
+ struct folio *folio,
struct btrfs_file_extent_item *item)
{
int ret;
@@ -6700,7 +6843,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_SIZE, max_size);
- ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
+ ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
+ max_size);
/*
* decompression code contains a memset to fill in any space between the end
@@ -6711,36 +6855,36 @@ static noinline int uncompress_inline(struct btrfs_path *path,
*/
if (max_size < PAGE_SIZE)
- memzero_page(page, max_size, PAGE_SIZE - max_size);
+ folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
kfree(tmp);
return ret;
}
static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
- struct page *page)
+ struct folio *folio)
{
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
- if (!page || PageUptodate(page))
+ if (!folio || folio_test_uptodate(folio))
return 0;
- ASSERT(page_offset(page) == 0);
+ ASSERT(folio_pos(folio) == 0);
fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
- return uncompress_inline(path, page, fi);
+ return uncompress_inline(path, folio, fi);
copy_size = min_t(u64, PAGE_SIZE,
btrfs_file_extent_ram_bytes(path->nodes[0], fi));
- kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
if (copy_size < PAGE_SIZE)
- memzero_page(page, copy_size, PAGE_SIZE - copy_size);
+ folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
return 0;
}
@@ -6749,7 +6893,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
*
* @inode: file to search in
* @page: page to read extent data into if the extent is inline
- * @pg_offset: offset into @page to copy to
* @start: file offset
* @len: length of range starting at @start
*
@@ -6763,8 +6906,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
* Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
+ struct folio *folio, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
@@ -6787,7 +6929,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (em) {
if (em->start > start || em->start + em->len <= start)
free_extent_map(em);
- else if (em->block_start == EXTENT_MAP_INLINE && page)
+ else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
free_extent_map(em);
else
goto out;
@@ -6798,9 +6940,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto out;
}
em->start = EXTENT_MAP_HOLE;
- em->orig_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
em->len = (u64)-1;
- em->block_len = (u64)-1;
path = btrfs_alloc_path();
if (!path) {
@@ -6890,9 +7031,8 @@ next:
/* New extent overlaps with existing one */
em->start = start;
- em->orig_start = start;
em->len = found_key.offset - start;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
goto insert;
}
@@ -6907,7 +7047,6 @@ next:
* ensured by tree-checker and inline extent creation path.
* Thus all members representing file offsets should be zero.
*/
- ASSERT(pg_offset == 0);
ASSERT(extent_start == 0);
ASSERT(em->start == 0);
@@ -6917,19 +7056,18 @@ next:
*
* Other members are not utilized for inline extents.
*/
- ASSERT(em->block_start == EXTENT_MAP_INLINE);
+ ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
- ret = read_inline_extent(inode, path, page);
+ ret = read_inline_extent(inode, path, folio);
if (ret < 0)
goto out;
goto insert;
}
not_found:
em->start = start;
- em->orig_start = start;
em->len = len;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
insert:
ret = 0;
btrfs_release_path(path);
@@ -6942,7 +7080,7 @@ insert:
}
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
@@ -6956,84 +7094,6 @@ out:
return em;
}
-static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
- struct btrfs_dio_data *dio_data,
- const u64 start,
- const u64 len,
- const u64 orig_start,
- const u64 block_start,
- const u64 block_len,
- const u64 orig_block_len,
- const u64 ram_bytes,
- const int type)
-{
- struct extent_map *em = NULL;
- struct btrfs_ordered_extent *ordered;
-
- if (type != BTRFS_ORDERED_NOCOW) {
- em = create_io_em(inode, start, len, orig_start, block_start,
- block_len, orig_block_len, ram_bytes,
- BTRFS_COMPRESS_NONE, /* compress_type */
- type);
- if (IS_ERR(em))
- goto out;
- }
- ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
- block_start, block_len, 0,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT),
- BTRFS_COMPRESS_NONE);
- if (IS_ERR(ordered)) {
- if (em) {
- free_extent_map(em);
- btrfs_drop_extent_map_range(inode, start,
- start + len - 1, false);
- }
- em = ERR_CAST(ordered);
- } else {
- ASSERT(!dio_data->ordered);
- dio_data->ordered = ordered;
- }
- out:
-
- return em;
-}
-
-static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
- struct btrfs_dio_data *dio_data,
- u64 start, u64 len)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_map *em;
- struct btrfs_key ins;
- u64 alloc_hint;
- int ret;
-
- alloc_hint = get_extent_allocation_hint(inode, start, len);
-again:
- ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
- 0, alloc_hint, &ins, 1, 1);
- if (ret == -EAGAIN) {
- ASSERT(btrfs_is_zoned(fs_info));
- wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
- TASK_UNINTERRUPTIBLE);
- goto again;
- }
- if (ret)
- return ERR_PTR(ret);
-
- em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
- ins.objectid, ins.offset, ins.offset,
- ins.offset, BTRFS_ORDERED_REGULAR);
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
- 1);
-
- return em;
-}
-
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *block_group;
@@ -7068,10 +7128,10 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* any ordered extents.
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool nowait, bool strict)
+ struct btrfs_file_extent *file_extent,
+ bool nowait, bool strict)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
@@ -7119,8 +7179,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
- if (ram_bytes)
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
nocow_args.start = offset;
nocow_args.end = offset + *len - 1;
@@ -7138,162 +7196,93 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
ret = 0;
- if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
+ if (btrfs_extent_readonly(fs_info,
+ nocow_args.file_extent.disk_bytenr +
+ nocow_args.file_extent.offset))
goto out;
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + nocow_args.num_bytes,
+ range_end = round_up(offset + nocow_args.file_extent.num_bytes,
root->fs_info->sectorsize) - 1;
- ret = test_range_bit(io_tree, offset, range_end,
- EXTENT_DELALLOC, 0, NULL);
+ ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
if (ret) {
ret = -EAGAIN;
goto out;
}
}
- if (orig_start)
- *orig_start = key.offset - nocow_args.extent_offset;
- if (orig_block_len)
- *orig_block_len = nocow_args.disk_num_bytes;
+ if (file_extent)
+ memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
- *len = nocow_args.num_bytes;
+ *len = nocow_args.file_extent.num_bytes;
ret = 1;
out:
btrfs_free_path(path);
return ret;
}
-static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state,
- unsigned int iomap_flags)
-{
- const bool writing = (iomap_flags & IOMAP_WRITE);
- const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- int ret = 0;
-
- while (1) {
- if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend,
- cached_state))
- return -EAGAIN;
- } else {
- lock_extent(io_tree, lockstart, lockend, cached_state);
- }
- /*
- * We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure there's no ordered
- * extents in this range.
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
- lockend - lockstart + 1);
-
- /*
- * We need to make sure there are no buffered pages in this
- * range either, we could have raced between the invalidate in
- * generic_file_direct_write and locking the extent. The
- * invalidate needs to happen so that reads after a write do not
- * get stale data.
- */
- if (!ordered &&
- (!writing || !filemap_range_has_page(inode->i_mapping,
- lockstart, lockend)))
- break;
-
- unlock_extent(io_tree, lockstart, lockend, cached_state);
-
- if (ordered) {
- if (nowait) {
- btrfs_put_ordered_extent(ordered);
- ret = -EAGAIN;
- break;
- }
- /*
- * If we are doing a DIO read and the ordered extent we
- * found is for a buffered write, we can not wait for it
- * to complete and retry, because if we do so we can
- * deadlock with concurrent buffered writes on page
- * locks. This happens only if our DIO read covers more
- * than one extent map, if at this point has already
- * created an ordered extent for a previous extent map
- * and locked its range in the inode's io tree, and a
- * concurrent write against that previous extent map's
- * range and this range started (we unlock the ranges
- * in the io tree only when the bios complete and
- * buffered writes always lock pages before attempting
- * to lock range in the io tree).
- */
- if (writing ||
- test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(ordered);
- else
- ret = nowait ? -EAGAIN : -ENOTBLK;
- btrfs_put_ordered_extent(ordered);
- } else {
- /*
- * We could trigger writeback for this range (and wait
- * for it to complete) and then invalidate the pages for
- * this range (through invalidate_inode_pages2_range()),
- * but that can lead us to a deadlock with a concurrent
- * call to readahead (a buffered read or a defrag call
- * triggered a readahead) on a page lock due to an
- * ordered dio extent we created before but did not have
- * yet a corresponding bio submitted (whence it can not
- * complete), which makes readahead wait for that
- * ordered extent to complete while holding a lock on
- * that page.
- */
- ret = nowait ? -EAGAIN : -ENOTBLK;
- }
-
- if (ret)
- break;
-
- cond_resched();
- }
-
- return ret;
-}
-
/* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
- u64 len, u64 orig_start, u64 block_start,
- u64 block_len, u64 orig_block_len,
- u64 ram_bytes, int compress_type,
- int type)
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+ const struct btrfs_file_extent *file_extent,
+ int type)
{
struct extent_map *em;
int ret;
+ /*
+ * Note the missing NOCOW type.
+ *
+ * For pure NOCOW writes, we should not create an io extent map, but
+ * just reusing the existing one.
+ * Only PREALLOC writes (NOCOW write into preallocated range) can
+ * create an io extent map.
+ */
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
type == BTRFS_ORDERED_COMPRESSED ||
- type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
+ switch (type) {
+ case BTRFS_ORDERED_PREALLOC:
+ /* We're only referring part of a larger preallocated extent. */
+ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
+ break;
+ case BTRFS_ORDERED_REGULAR:
+ /* COW results a new extent matching our file extent size. */
+ ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
+ ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
+
+ /* Since it's a new extent, we should not have any offset. */
+ ASSERT(file_extent->offset == 0);
+ break;
+ case BTRFS_ORDERED_COMPRESSED:
+ /* Must be compressed. */
+ ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
+
+ /*
+ * Encoded write can make us to refer to part of the
+ * uncompressed extent.
+ */
+ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
+ break;
+ }
+
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
em->start = start;
- em->orig_start = orig_start;
- em->len = len;
- em->block_len = block_len;
- em->block_start = block_start;
- em->orig_block_len = orig_block_len;
- em->ram_bytes = ram_bytes;
+ em->len = file_extent->num_bytes;
+ em->disk_bytenr = file_extent->disk_bytenr;
+ em->disk_num_bytes = file_extent->disk_num_bytes;
+ em->ram_bytes = file_extent->ram_bytes;
em->generation = -1;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
- if (type == BTRFS_ORDERED_PREALLOC) {
- set_bit(EXTENT_FLAG_FILLING, &em->flags);
- } else if (type == BTRFS_ORDERED_COMPRESSED) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
+ em->offset = file_extent->offset;
+ em->flags |= EXTENT_FLAG_PINNED;
+ if (type == BTRFS_ORDERED_COMPRESSED)
+ extent_map_set_compression(em, file_extent->compression);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7305,591 +7294,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
return em;
}
-
-static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct inode *inode,
- struct btrfs_dio_data *dio_data,
- u64 start, u64 *lenp,
- unsigned int iomap_flags)
-{
- const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map *em = *map;
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
- struct btrfs_block_group *bg;
- bool can_nocow = false;
- bool space_reserved = false;
- u64 len = *lenp;
- u64 prev_len;
- int ret = 0;
-
- /*
- * We don't allocate a new extent in the following cases
- *
- * 1) The inode is marked as NODATACOW. In this case we'll just use the
- * existing extent.
- * 2) The extent is marked as PREALLOC. We're good to go here and can
- * just use the extent.
- *
- */
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
- ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- em->block_start != EXTENT_MAP_HOLE)) {
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- type = BTRFS_ORDERED_PREALLOC;
- else
- type = BTRFS_ORDERED_NOCOW;
- len = min(len, em->len - (start - em->start));
- block_start = em->block_start + (start - em->start);
-
- if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false, false) == 1) {
- bg = btrfs_inc_nocow_writers(fs_info, block_start);
- if (bg)
- can_nocow = true;
- }
- }
-
- prev_len = len;
- if (can_nocow) {
- struct extent_map *em2;
-
- /* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
- nowait);
- if (ret < 0) {
- /* Our caller expects us to free the input extent map. */
- free_extent_map(em);
- *map = NULL;
- btrfs_dec_nocow_writers(bg);
- if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
- ret = -EAGAIN;
- goto out;
- }
- space_reserved = true;
-
- em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
- btrfs_dec_nocow_writers(bg);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- *map = em2;
- em = em2;
- }
-
- if (IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto out;
- }
-
- dio_data->nocow_done = true;
- } else {
- /* Our caller expects us to free the input extent map. */
- free_extent_map(em);
- *map = NULL;
-
- if (nowait) {
- ret = -EAGAIN;
- goto out;
- }
-
- /*
- * If we could not allocate data space before locking the file
- * range and we can't do a NOCOW write, then we have to fail.
- */
- if (!dio_data->data_space_reserved) {
- ret = -ENOSPC;
- goto out;
- }
-
- /*
- * We have to COW and we have already reserved data space before,
- * so now we reserve only metadata.
- */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
- false);
- if (ret < 0)
- goto out;
- space_reserved = true;
-
- em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
- }
- *map = em;
- len = min(len, em->len - (start - em->start));
- if (len < prev_len)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- prev_len - len, true);
- }
-
- /*
- * We have created our ordered extent, so we can now release our reservation
- * for an outstanding extent.
- */
- btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
-
- /*
- * Need to update the i_size under the extent lock so buffered
- * readers will get the updated i_size when we unlock.
- */
- if (start + len > i_size_read(inode))
- i_size_write(inode, start + len);
-out:
- if (ret && space_reserved) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), len);
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
- }
- *lenp = len;
- return ret;
-}
-
-static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
- loff_t length, unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
-{
- struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
- struct btrfs_dio_data *dio_data = iter->private;
- u64 lockstart, lockend;
- const bool write = !!(flags & IOMAP_WRITE);
- int ret = 0;
- u64 len = length;
- const u64 data_alloc_len = length;
- bool unlock_extents = false;
-
- /*
- * We could potentially fault if we have a buffer > PAGE_SIZE, and if
- * we're NOWAIT we may submit a bio for a partial range and return
- * EIOCBQUEUED, which would result in an errant short read.
- *
- * The best way to handle this would be to allow for partial completions
- * of iocb's, so we could submit the partial bio, return and fault in
- * the rest of the pages, and then submit the io for the rest of the
- * range. However we don't have that currently, so simply return
- * -EAGAIN at this point so that the normal path is used.
- */
- if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
- return -EAGAIN;
-
- /*
- * Cap the size of reads to that usually seen in buffered I/O as we need
- * to allocate a contiguous array for the checksums.
- */
- if (!write)
- len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
-
- lockstart = start;
- lockend = start + len - 1;
-
- /*
- * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
- * enough if we've written compressed pages to this area, so we need to
- * flush the dirty pages again to make absolutely sure that any
- * outstanding dirty pages are on disk - the first flush only starts
- * compression on the data, while keeping the pages locked, so by the
- * time the second flush returns we know bios for the compressed pages
- * were submitted and finished, and the pages no longer under writeback.
- *
- * If we have a NOWAIT request and we have any pages in the range that
- * are locked, likely due to compression still in progress, we don't want
- * to block on page locks. We also don't want to block on pages marked as
- * dirty or under writeback (same as for the non-compression case).
- * iomap_dio_rw() did the same check, but after that and before we got
- * here, mmap'ed writes may have happened or buffered reads started
- * (readpage() and readahead(), which lock pages), as we haven't locked
- * the file range yet.
- */
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags)) {
- if (flags & IOMAP_NOWAIT) {
- if (filemap_range_needs_writeback(inode->i_mapping,
- lockstart, lockend))
- return -EAGAIN;
- } else {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- start + length - 1);
- if (ret)
- return ret;
- }
- }
-
- memset(dio_data, 0, sizeof(*dio_data));
-
- /*
- * We always try to allocate data space and must do it before locking
- * the file range, to avoid deadlocks with concurrent writes to the same
- * range if the range has several extents and the writes don't expand the
- * current i_size (the inode lock is taken in shared mode). If we fail to
- * allocate data space here we continue and later, after locking the
- * file range, we fail with ENOSPC only if we figure out we can not do a
- * NOCOW write.
- */
- if (write && !(flags & IOMAP_NOWAIT)) {
- ret = btrfs_check_data_free_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, data_alloc_len, false);
- if (!ret)
- dio_data->data_space_reserved = true;
- else if (ret && !(BTRFS_I(inode)->flags &
- (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
- goto err;
- }
-
- /*
- * If this errors out it's because we couldn't invalidate pagecache for
- * this range and we need to fallback to buffered IO, or we are doing a
- * NOWAIT read/write and we need to block.
- */
- ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
- if (ret < 0)
- goto err;
-
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
-
- /*
- * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
- * io. INLINE is special, and we could probably kludge it in here, but
- * it's still buffered so for safety lets just fall back to the generic
- * buffered path.
- *
- * For COMPRESSED we _have_ to read the entire extent in so we can
- * decompress it, so there will be buffering required no matter what we
- * do, so go ahead and fallback to buffered.
- *
- * We return -ENOTBLK because that's what makes DIO go ahead and go back
- * to buffered IO. Don't blame me, this is the price we pay for using
- * the generic code.
- */
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
- em->block_start == EXTENT_MAP_INLINE) {
- free_extent_map(em);
- /*
- * If we are in a NOWAIT context, return -EAGAIN in order to
- * fallback to buffered IO. This is not only because we can
- * block with buffered IO (no support for NOWAIT semantics at
- * the moment) but also to avoid returning short reads to user
- * space - this happens if we were able to read some data from
- * previous non-compressed extents and then when we fallback to
- * buffered IO, at btrfs_file_read_iter() by calling
- * filemap_read(), we fail to fault in pages for the read buffer,
- * in which case filemap_read() returns a short read (the number
- * of bytes previously read is > 0, so it does not return -EFAULT).
- */
- ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
- goto unlock_err;
- }
-
- len = min(len, em->len - (start - em->start));
-
- /*
- * If we have a NOWAIT request and the range contains multiple extents
- * (or a mix of extents and holes), then we return -EAGAIN to make the
- * caller fallback to a context where it can do a blocking (without
- * NOWAIT) request. This way we avoid doing partial IO and returning
- * success to the caller, which is not optimal for writes and for reads
- * it can result in unexpected behaviour for an application.
- *
- * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
- * iomap_dio_rw(), we can end up returning less data then what the caller
- * asked for, resulting in an unexpected, and incorrect, short read.
- * That is, the caller asked to read N bytes and we return less than that,
- * which is wrong unless we are crossing EOF. This happens if we get a
- * page fault error when trying to fault in pages for the buffer that is
- * associated to the struct iov_iter passed to iomap_dio_rw(), and we
- * have previously submitted bios for other extents in the range, in
- * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
- * those bios have completed by the time we get the page fault error,
- * which we return back to our caller - we should only return EIOCBQUEUED
- * after we have submitted bios for all the extents in the range.
- */
- if ((flags & IOMAP_NOWAIT) && len < length) {
- free_extent_map(em);
- ret = -EAGAIN;
- goto unlock_err;
- }
-
- if (write) {
- ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, &len, flags);
- if (ret < 0)
- goto unlock_err;
- unlock_extents = true;
- /* Recalc len in case the new em is smaller than requested */
- len = min(len, em->len - (start - em->start));
- if (dio_data->data_space_reserved) {
- u64 release_offset;
- u64 release_len = 0;
-
- if (dio_data->nocow_done) {
- release_offset = start;
- release_len = data_alloc_len;
- } else if (len < data_alloc_len) {
- release_offset = start + len;
- release_len = data_alloc_len - len;
- }
-
- if (release_len > 0)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- dio_data->data_reserved,
- release_offset,
- release_len);
- }
- } else {
- /*
- * We need to unlock only the end area that we aren't using.
- * The rest is going to be unlocked by the endio routine.
- */
- lockstart = start + len;
- if (lockstart < lockend)
- unlock_extents = true;
- }
-
- if (unlock_extents)
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
- else
- free_extent_state(cached_state);
-
- /*
- * Translate extent map information to iomap.
- * We trim the extents (and move the addr) even though iomap code does
- * that, since we have locked only the parts we are performing I/O in.
- */
- if ((em->block_start == EXTENT_MAP_HOLE) ||
- (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
- } else {
- iomap->addr = em->block_start + (start - em->start);
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
- iomap->length = len;
- free_extent_map(em);
-
- return 0;
-
-unlock_err:
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
-err:
- if (dio_data->data_space_reserved) {
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start, data_alloc_len);
- extent_changeset_free(dio_data->data_reserved);
- }
-
- return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
- ssize_t written, unsigned int flags, struct iomap *iomap)
-{
- struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_dio_data *dio_data = iter->private;
- size_t submitted = dio_data->submitted;
- const bool write = !!(flags & IOMAP_WRITE);
- int ret = 0;
-
- if (!write && (iomap->type == IOMAP_HOLE)) {
- /* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
- NULL);
- return 0;
- }
-
- if (submitted < length) {
- pos += submitted;
- length -= submitted;
- if (write)
- btrfs_finish_ordered_extent(dio_data->ordered, NULL,
- pos, length, false);
- else
- unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
- ret = -ENOTBLK;
- }
- if (write) {
- btrfs_put_ordered_extent(dio_data->ordered);
- dio_data->ordered = NULL;
- }
-
- if (write)
- extent_changeset_free(dio_data->data_reserved);
- return ret;
-}
-
-static void btrfs_dio_end_io(struct btrfs_bio *bbio)
-{
- struct btrfs_dio_private *dip =
- container_of(bbio, struct btrfs_dio_private, bbio);
- struct btrfs_inode *inode = bbio->inode;
- struct bio *bio = &bbio->bio;
-
- if (bio->bi_status) {
- btrfs_warn(inode->root->fs_info,
- "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
- btrfs_ino(inode), bio->bi_opf,
- dip->file_offset, dip->bytes, bio->bi_status);
- }
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- btrfs_finish_ordered_extent(bbio->ordered, NULL,
- dip->file_offset, dip->bytes,
- !bio->bi_status);
- } else {
- unlock_extent(&inode->io_tree, dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
- }
-
- bbio->bio.bi_private = bbio->private;
- iomap_dio_bio_end_io(bio);
-}
-
-static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
- loff_t file_offset)
-{
- struct btrfs_bio *bbio = btrfs_bio(bio);
- struct btrfs_dio_private *dip =
- container_of(bbio, struct btrfs_dio_private, bbio);
- struct btrfs_dio_data *dio_data = iter->private;
-
- btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
- btrfs_dio_end_io, bio->bi_private);
- bbio->inode = BTRFS_I(iter->inode);
- bbio->file_offset = file_offset;
-
- dip->file_offset = file_offset;
- dip->bytes = bio->bi_iter.bi_size;
-
- dio_data->submitted += bio->bi_iter.bi_size;
-
- /*
- * Check if we are doing a partial write. If we are, we need to split
- * the ordered extent to match the submitted bio. Hang on to the
- * remaining unfinishable ordered_extent in dio_data so that it can be
- * cancelled in iomap_end to avoid a deadlock wherein faulting the
- * remaining pages is blocked on the outstanding ordered extent.
- */
- if (iter->flags & IOMAP_WRITE) {
- int ret;
-
- ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
- if (ret) {
- btrfs_finish_ordered_extent(dio_data->ordered, NULL,
- file_offset, dip->bytes,
- !ret);
- bio->bi_status = errno_to_blk_status(ret);
- iomap_dio_bio_end_io(bio);
- return;
- }
- }
-
- btrfs_submit_bio(bbio, 0);
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
- .iomap_begin = btrfs_dio_iomap_begin,
- .iomap_end = btrfs_dio_iomap_end,
-};
-
-static const struct iomap_dio_ops btrfs_dio_ops = {
- .submit_io = btrfs_dio_submit_io,
- .bio_set = &btrfs_dio_bioset,
-};
-
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
-{
- struct btrfs_dio_data data = { 0 };
-
- return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before)
-{
- struct btrfs_dio_data data = { 0 };
-
- return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
- int ret;
-
- ret = fiemap_prep(inode, fieinfo, start, &len, 0);
- if (ret)
- return ret;
-
- /*
- * fiemap_prep() called filemap_write_and_wait() for the whole possible
- * file range (0 to LLONG_MAX), but that is not enough if we have
- * compression enabled. The first filemap_fdatawrite_range() only kicks
- * in the compression of data (in an async thread) and will return
- * before the compression is done and writeback is started. A second
- * filemap_fdatawrite_range() is needed to wait for the compression to
- * complete and writeback to start. We also need to wait for ordered
- * extents to complete, because our fiemap implementation uses mainly
- * file extent items to list the extents, searching for extent maps
- * only for file ranges with holes or prealloc extents to figure out
- * if we have delalloc in those ranges.
- */
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
- ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
- if (ret)
- return ret;
- }
-
- btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
- /*
- * We did an initial flush to avoid holding the inode's lock while
- * triggering writeback and waiting for the completion of IO and ordered
- * extents. Now after we locked the inode we do it again, because it's
- * possible a new write may have happened in between those two steps.
- */
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
- ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
- if (ret) {
- btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
- return ret;
- }
- }
-
- ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
- btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
- return ret;
-}
-
-static int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return extent_writepages(mapping, wbc);
-}
-
-static void btrfs_readahead(struct readahead_control *rac)
-{
- extent_readahead(rac);
-}
-
/*
* For release_folio() and invalidate_folio() we have a race window where
* folio_end_writeback() is called but the subpage spinlock is not yet released.
@@ -7897,16 +7301,16 @@ static void btrfs_readahead(struct readahead_control *rac)
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
*/
-static void wait_subpage_spinlock(struct page *page)
+static void wait_subpage_spinlock(struct folio *folio)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7923,15 +7327,20 @@ static void wait_subpage_spinlock(struct page *page)
spin_unlock_irq(&subpage->lock);
}
-static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
+static int btrfs_launder_folio(struct folio *folio)
{
- int ret = try_release_extent_mapping(&folio->page, gfp_flags);
+ return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
+ PAGE_SIZE, NULL);
+}
- if (ret == 1) {
- wait_subpage_spinlock(&folio->page);
- clear_page_extent_mapped(&folio->page);
+static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
+{
+ if (try_release_extent_mapping(folio, gfp_flags)) {
+ wait_subpage_spinlock(folio);
+ clear_folio_extent_mapped(folio);
+ return true;
}
- return ret;
+ return false;
}
static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -7965,7 +7374,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
- struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(folio);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
@@ -7988,7 +7397,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* do double ordered extent accounting on the same folio.
*/
folio_wait_writeback(folio);
- wait_subpage_spinlock(&folio->page);
+ wait_subpage_spinlock(folio);
/*
* For subpage case, we have call sites like
@@ -8044,7 +7453,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8053,7 +7462,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*/
goto next;
}
- btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+ btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8069,11 +7478,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, &cached_state);
- spin_lock_irq(&inode->ordered_tree.lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
ordered->truncated_len = min(ordered->truncated_len,
cur - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree.lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
/*
* If the ordered extent has finished, we're safe to delete all
@@ -8123,174 +7532,10 @@ next:
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
- btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
- clear_page_extent_mapped(&folio->page);
-}
-
-/*
- * btrfs_page_mkwrite() is not allowed to change the file size as it gets
- * called from a page fault handler when a page is first dirtied. Hence we must
- * be careful to check for EOF conditions here. We set the page up correctly
- * for a written page which means we get ENOSPC checking when writing into
- * holes and correct delalloc and unwritten extent mapping on filesystems that
- * support these features.
- *
- * We are not allowed to take the i_mutex here so we have to play games to
- * protect against truncate races as the page could now be beyond EOF. Because
- * truncate_setsize() writes the inode size before removing pages, once we have
- * the page lock we can determine safely if the page is beyond EOF. If it is not
- * beyond EOF, then the page is guaranteed safe against truncation until we
- * unlock the page.
- */
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
-{
- struct page *page = vmf->page;
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_changeset *data_reserved = NULL;
- unsigned long zero_start;
- loff_t size;
- vm_fault_t ret;
- int ret2;
- int reserved = 0;
- u64 reserved_space;
- u64 page_start;
- u64 page_end;
- u64 end;
-
- reserved_space = PAGE_SIZE;
-
- sb_start_pagefault(inode->i_sb);
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- end = page_end;
-
- /*
- * Reserving delalloc space after obtaining the page lock can lead to
- * deadlock. For example, if a dirty page is locked by this function
- * and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepages() function could
- * end up waiting indefinitely to get a lock on the page currently
- * being processed by btrfs_page_mkwrite() function.
- */
- ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (!ret2) {
- ret2 = file_update_time(vmf->vma->vm_file);
- reserved = 1;
- }
- if (ret2) {
- ret = vmf_error(ret2);
- if (reserved)
- goto out;
- goto out_noreserve;
- }
-
- ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
-again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
- lock_page(page);
- size = i_size_read(inode);
-
- if ((page->mapping != inode->i_mapping) ||
- (page_start >= size)) {
- /* page got truncated out from underneath us */
- goto out_unlock;
- }
- wait_on_page_writeback(page);
-
- lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_page_extent_mapped(page);
- if (ret2 < 0) {
- ret = vmf_error(ret2);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- goto out_unlock;
- }
-
- /*
- * we can't set the delalloc bits if there are pending ordered
- * extents. Drop our locks and wait for them to finish
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_start_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
-
- if (page->index == ((size - 1) >> PAGE_SHIFT)) {
- reserved_space = round_up(size - page_start,
- fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
- end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
- }
- }
-
- /*
- * page_mkwrite gets called when the page is firstly dirtied after it's
- * faulted in, but write(2) could also dirty a page and set delalloc
- * bits, thus in this case for space account reason, we still need to
- * clear any delalloc bits within this page range since we have to
- * reserve data&meta space before lock_page() (see above comments).
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
- if (ret2) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- ret = VM_FAULT_SIGBUS;
- goto out_unlock;
- }
-
- /* page is wholly or partially inside EOF */
- if (page_start + PAGE_SIZE > size)
- zero_start = offset_in_page(size);
- else
- zero_start = PAGE_SIZE;
-
- if (zero_start != PAGE_SIZE)
- memzero_page(page, zero_start, PAGE_SIZE - zero_start);
-
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
- btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
- btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
-
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
-
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return VM_FAULT_LOCKED;
-
-out_unlock:
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, (ret != 0));
-out_noreserve:
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return ret;
+ clear_folio_extent_mapped(folio);
}
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
@@ -8310,7 +7555,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
if (!skip_writeback) {
- ret = btrfs_wait_ordered_range(&inode->vfs_inode,
+ ret = btrfs_wait_ordered_range(inode,
inode->vfs_inode.i_size & (~mask),
(u64)-1);
if (ret)
@@ -8403,7 +7648,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
break;
@@ -8456,7 +7701,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_update_inode(trans, inode);
if (ret2 && !ret)
ret = ret2;
@@ -8527,8 +7772,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->disk_i_size = 0;
ei->flags = 0;
ei->ro_flags = 0;
+ /*
+ * ->index_cnt will be properly initialized later when creating a new
+ * inode (btrfs_create_new_inode()) or when reading an existing inode
+ * from disk (btrfs_read_locked_inode()).
+ */
ei->csum_bytes = 0;
- ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_reflink_trans = 0;
@@ -8545,20 +7794,24 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->delayed_node = NULL;
- ei->i_otime.tv_sec = 0;
- ei->i_otime.tv_nsec = 0;
+ ei->i_otime_sec = 0;
+ ei->i_otime_nsec = 0;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
+
+ /* This io tree sets the valid inode. */
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
+
+ ei->file_extent_tree = NULL;
+
mutex_init(&ei->log_mutex);
- btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ spin_lock_init(&ei->ordered_tree_lock);
+ ei->ordered_tree = RB_ROOT;
+ ei->ordered_tree_last = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
- RB_CLEAR_NODE(&ei->rb_node);
init_rwsem(&ei->i_mmap_lock);
return inode;
@@ -8568,12 +7821,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
void btrfs_test_destroy_inode(struct inode *inode)
{
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
void btrfs_free_inode(struct inode *inode)
{
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -8592,9 +7847,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
if (!S_ISDIR(vfs_inode->i_mode)) {
WARN_ON(inode->delalloc_bytes);
WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
}
- WARN_ON(inode->csum_bytes);
- WARN_ON(inode->defrag_bytes);
+ if (!root || !btrfs_is_data_reloc_root(root))
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8628,7 +7884,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
}
}
btrfs_qgroup_check_reserved_leak(inode);
- inode_tree_del(inode);
+ btrfs_del_inode_from_root(inode);
btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
@@ -8662,7 +7918,6 @@ void __cold btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
- bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
}
@@ -8670,20 +7925,12 @@ int __init btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
init_once);
if (!btrfs_inode_cachep)
- goto fail;
-
- if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_dio_private, bbio.bio),
- BIOSET_NEED_BVECS))
- goto fail;
+ return -ENOMEM;
return 0;
-fail:
- btrfs_destroy_cachep();
- return -ENOMEM;
}
static int btrfs_getattr(struct mnt_idmap *idmap,
@@ -8698,8 +7945,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
- stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
- stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
+ stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
+ stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
if (bi_flags & BTRFS_INODE_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
if (bi_flags & BTRFS_INODE_COMPRESS)
@@ -8719,6 +7966,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
+ stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->result_mask |= STATX_SUBVOL;
+
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
@@ -8733,7 +7983,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_dir,
struct dentry *new_dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -8749,6 +7999,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -8872,6 +8123,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries) pin the log early to prevent any concurrent
+ * task from logging the directory after we removed the old
+ * entries and before we add the new entries, otherwise that
+ * task can sync a log without any entry for the inodes we are
+ * renaming and therefore replaying that log, if a power failure
+ * happens after syncing the log, would result in deleting the
+ * inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8887,7 +8163,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -8902,7 +8178,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -8929,30 +8205,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8985,7 +8254,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
struct btrfs_new_inode_args whiteout_args = {
.dir = old_dir,
.dentry = old_dentry,
@@ -9002,6 +8271,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9136,6 +8406,29 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry) pin the log to prevent any concurrent task from
+ * logging the directory after we removed the old entry and
+ * before we add the new entry, otherwise that task can sync
+ * a log without any entry for the inode we are renaming and
+ * therefore replaying that log, if a power failure happens
+ * after syncing the log, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -9147,7 +8440,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9184,7 +8477,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
@@ -9200,6 +8493,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9272,7 +8569,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
return work;
}
@@ -9427,7 +8724,7 @@ out:
static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
@@ -9598,7 +8895,7 @@ free_qgroup:
* or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid, qgroup_released,
+ btrfs_root_id(inode->root), qgroup_released,
BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(ret);
}
@@ -9608,7 +8905,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
@@ -9673,13 +8970,12 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
em->start = cur_offset;
- em->orig_start = cur_offset;
em->len = ins.offset;
- em->block_start = ins.objectid;
- em->block_len = ins.offset;
- em->orig_block_len = ins.offset;
+ em->disk_bytenr = ins.objectid;
+ em->offset = 0;
+ em->disk_num_bytes = ins.offset;
em->ram_bytes = ins.offset;
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9703,7 +8999,7 @@ next:
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9760,7 +9056,7 @@ static int btrfs_permission(struct mnt_idmap *idmap,
static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode;
@@ -9823,17 +9119,19 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
u32 len;
ASSERT(end + 1 - start <= U32_MAX);
len = end + 1 - start;
while (index <= end_index) {
- page = find_get_page(inode->vfs_inode.i_mapping, index);
- ASSERT(page); /* Pages should be in the extent_io_tree */
+ folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
+ ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */
- btrfs_page_set_writeback(fs_info, page, start, len);
- put_page(page);
+ /* This is for data, which doesn't yet support larger folio. */
+ ASSERT(folio_order(folio) == 0);
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ folio_put(folio);
index++;
}
}
@@ -9970,26 +9268,31 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (!atomic_dec_return(&priv->pending))
+ if (atomic_dec_and_test(&priv->pending))
wake_up(&priv->wait);
bio_put(&bbio->bio);
}
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size, struct page **pages)
+ u64 disk_bytenr, u64 disk_io_size,
+ struct page **pages)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private priv = {
- .pending = ATOMIC_INIT(1),
- };
+ struct btrfs_encoded_read_private *priv;
unsigned long i = 0;
struct btrfs_bio *bbio;
+ int ret;
+
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
- init_waitqueue_head(&priv.wait);
+ init_waitqueue_head(&priv->wait);
+ atomic_set(&priv->pending, 1);
+ priv->status = 0;
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
@@ -9997,11 +9300,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv.pending);
- btrfs_submit_bio(bbio, 0);
+ atomic_inc(&priv->pending);
+ btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
continue;
@@ -10012,22 +9315,22 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
- atomic_inc(&priv.pending);
- btrfs_submit_bio(bbio, 0);
+ atomic_inc(&priv->pending);
+ btrfs_submit_bbio(bbio, 0);
- if (atomic_dec_return(&priv.pending))
- io_wait_event(priv.wait, !atomic_read(&priv.pending));
+ if (atomic_dec_return(&priv->pending))
+ io_wait_event(priv->wait, !atomic_read(&priv->pending));
/* See btrfs_encoded_read_endio() for ordering. */
- return blk_status_to_errno(READ_ONCE(priv.status));
+ ret = blk_status_to_errno(READ_ONCE(priv->status));
+ kfree(priv);
+ return ret;
}
-static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
- struct iov_iter *iter,
- u64 start, u64 lockend,
- struct extent_state **cached_state,
- u64 disk_bytenr, u64 disk_io_size,
- size_t count, bool compressed,
- bool *unlocked)
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -10041,13 +9344,13 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages);
+ ret = btrfs_alloc_page_array(nr_pages, pages, false);
if (ret) {
ret = -ENOMEM;
goto out;
}
- ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
disk_io_size, pages);
if (ret)
goto out;
@@ -10088,15 +9391,16 @@ out:
}
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded)
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
ssize_t ret;
size_t count = iov_iter_count(iter);
- u64 start, lockend, disk_bytenr, disk_io_size;
- struct extent_state *cached_state = NULL;
+ u64 start, lockend;
struct extent_map *em;
bool unlocked = false;
@@ -10118,27 +9422,27 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ ret = btrfs_wait_ordered_range(inode, start,
lockend - start + 1);
if (ret)
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, &cached_state);
+ lock_extent(io_tree, start, lockend, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (!ordered)
break;
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, cached_state);
cond_resched();
}
- em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_unlock_extent;
}
- if (em->block_start == EXTENT_MAP_INLINE) {
+ if (em->disk_bytenr == EXTENT_MAP_INLINE) {
u64 extent_start = em->start;
/*
@@ -10148,7 +9452,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
- &cached_state, extent_start,
+ cached_state, extent_start,
count, encoded, &unlocked);
goto out;
}
@@ -10159,61 +9463,58 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- disk_bytenr = EXTENT_MAP_HOLE;
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
+ (em->flags & EXTENT_FLAG_PREALLOC)) {
+ *disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- disk_bytenr = em->block_start;
+ } else if (extent_map_is_compressed(em)) {
+ *disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
* compressed extent.
*/
- if (em->block_len > count) {
+ if (em->disk_num_bytes > count) {
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = em->block_len;
- count = em->block_len;
+ *disk_io_size = em->disk_num_bytes;
+ count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
- encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- em->compress_type);
+ extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
} else {
- disk_bytenr = em->block_start + (start - em->start);
+ *disk_bytenr = extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
* Don't read beyond what we locked. This also limits the page
* allocations that we'll do.
*/
- disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
- count = start + disk_io_size - iocb->ki_pos;
+ *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + *disk_io_size - iocb->ki_pos;
encoded->len = count;
encoded->unencoded_len = count;
- disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
free_extent_map(em);
em = NULL;
- if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ if (*disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
if (ret != count)
ret = -EFAULT;
} else {
- ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
- &cached_state, disk_bytenr,
- disk_io_size, count,
- encoded->compression,
- &unlocked);
+ ret = -EIOCBQUEUED;
+ goto out_em;
}
out:
@@ -10222,10 +9523,11 @@ out:
out_em:
free_extent_map(em);
out_unlock_extent:
- if (!unlocked)
- unlock_extent(io_tree, start, lockend, &cached_state);
+ /* Leave inode and extent locked if we need to do a read. */
+ if (!unlocked && ret != -EIOCBQUEUED)
+ unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
- if (!unlocked)
+ if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
@@ -10240,12 +9542,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
struct extent_changeset *data_reserved = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
int compression;
size_t orig_count;
u64 start, end;
u64 num_bytes, ram_bytes, disk_num_bytes;
- unsigned long nr_pages, i;
- struct page **pages;
+ unsigned long nr_folios, i;
+ struct folio **folios;
struct btrfs_key ins;
bool extent_reserved = false;
struct extent_map *em;
@@ -10334,24 +9637,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
* isn't.
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
- nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
- if (!pages)
+ nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!folios)
return -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_folios; i++) {
size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
char *kaddr;
- pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!pages[i]) {
+ folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
+ if (!folios[i]) {
ret = -ENOMEM;
- goto out_pages;
+ goto out_folios;
}
- kaddr = kmap_local_page(pages[i]);
+ kaddr = kmap_local_folio(folios[i], 0);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
kunmap_local(kaddr);
ret = -EFAULT;
- goto out_pages;
+ goto out_folios;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
@@ -10361,14 +9664,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
for (;;) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ ret = btrfs_wait_ordered_range(inode, start, num_bytes);
if (ret)
- goto out_pages;
+ goto out_folios;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ret)
- goto out_pages;
+ goto out_folios;
lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
@@ -10396,10 +9699,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_qgroup_free_data;
/* Try an inline extent first. */
- if (start == 0 && encoded->unencoded_len == encoded->len &&
- encoded->unencoded_offset == 0) {
- ret = cow_file_range_inline(inode, encoded->len, orig_count,
- compression, pages, true);
+ if (encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0 &&
+ can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
+ ret = __cow_file_range_inline(inode, start, encoded->len,
+ orig_count, compression, folios[0],
+ true);
if (ret <= 0) {
if (ret == 0)
ret = orig_count;
@@ -10413,22 +9718,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_delalloc_release;
extent_reserved = true;
- em = create_io_em(inode, start, num_bytes,
- start - encoded->unencoded_offset, ins.objectid,
- ins.offset, ins.offset, ram_bytes, compression,
- BTRFS_ORDERED_COMPRESSED);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = num_bytes;
+ file_extent.ram_bytes = ram_bytes;
+ file_extent.offset = encoded->unencoded_offset;
+ file_extent.compression = compression;
+ em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserved;
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
- ins.objectid, ins.offset,
- encoded->unencoded_offset,
- (1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED),
- compression);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ (1U << BTRFS_ORDERED_ENCODED) |
+ (1U << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -10443,7 +9748,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
btrfs_delalloc_release_extents(inode, num_bytes);
- btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
+ btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
ret = orig_count;
goto out;
@@ -10465,12 +9770,12 @@ out_free_data_space:
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
unlock_extent(io_tree, start, end, &cached_state);
-out_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i])
- __free_page(pages[i]);
+out_folios:
+ for (i = 0; i < nr_folios; i++) {
+ if (folios[i])
+ folio_put(folios[i]);
}
- kvfree(pages);
+ kvfree(folios);
out:
if (ret >= 0)
iocb->ki_pos += encoded->len;
@@ -10617,38 +9922,59 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- struct extent_map *em = NULL;
+ struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
};
+ struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
+ struct btrfs_path *path = NULL;
int ret = 0;
u64 isize;
- u64 start;
+ u64 prev_extent_end = 0;
+
+ /*
+ * Acquire the inode's mmap lock to prevent races with memory mapped
+ * writes, as they could happen after we flush delalloc below and before
+ * we lock the extent range further below. The inode was already locked
+ * up in the call chain.
+ */
+ btrfs_assert_inode_locked(BTRFS_I(inode));
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
/*
* If the swap file was just created, make sure delalloc is done. If the
* file changes again after this, the user is doing something stupid and
* we don't really care.
*/
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
- return ret;
+ goto out_unlock_mmap;
/*
* The inode is locked, so these flags won't change after we check them.
*/
if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
btrfs_warn(fs_info, "swapfile must not be compressed");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
btrfs_warn(fs_info, "swapfile must not be checksummed");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
+ }
+
+ path = btrfs_alloc_path();
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
+ if (!path || !backref_ctx) {
+ ret = -ENOMEM;
+ goto out_unlock_mmap;
}
/*
@@ -10663,7 +9989,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock_mmap;
}
/*
@@ -10677,7 +10004,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because snapshot creation is in progress");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
/*
* Snapshots can create extents which require COW even if NODATACOW is
@@ -10697,8 +10025,9 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
- root->root_key.objectid);
- return -EPERM;
+ btrfs_root_id(root));
+ ret = -EPERM;
+ goto out_unlock_mmap;
}
atomic_inc(&root->nr_swapfiles);
spin_unlock(&root->root_item_lock);
@@ -10706,24 +10035,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
lock_extent(io_tree, 0, isize - 1, &cached_state);
- start = 0;
- while (start < isize) {
- u64 logical_block_start, physical_block_start;
+ while (prev_extent_end < isize) {
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
struct btrfs_block_group *bg;
- u64 len = isize - start;
+ u64 logical_block_start;
+ u64 physical_block_start;
+ u64 extent_gen;
+ u64 disk_bytenr;
+ u64 len;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = prev_extent_end;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
goto out;
- }
- if (em->block_start == EXTENT_MAP_HOLE) {
+ /*
+ * If key not found it means we have an implicit hole (NO_HOLES
+ * is enabled).
+ */
+ if (ret > 0) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
- if (em->block_start == EXTENT_MAP_INLINE) {
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
/*
* It's unlikely we'll ever actually find ourselves
* here, as a file small enough to fit inline won't be
@@ -10735,36 +10079,58 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+
+ if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
}
- logical_block_start = em->block_start + (start - em->start);
- len = min(len, em->len - (start - em->start));
- free_extent_map(em);
- em = NULL;
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (disk_bytenr == 0) {
+ btrfs_warn(fs_info, "swapfile must not have holes");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+ prev_extent_end = btrfs_file_extent_end(path);
+
+ if (prev_extent_end > isize)
+ len = isize - key.offset;
+ else
+ len = btrfs_file_extent_num_bytes(leaf, ei);
+
+ backref_ctx->curr_leaf_bytenr = leaf->start;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
+ /*
+ * Don't need the path anymore, release to avoid deadlocks when
+ * calling btrfs_is_data_extent_shared() because when joining a
+ * transaction it can block waiting for the current one's commit
+ * which in turn may be trying to lock the same leaf to flush
+ * delayed items for example.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
+ extent_gen, backref_ctx);
if (ret < 0) {
goto out;
- } else if (ret) {
- ret = 0;
- } else {
+ } else if (ret > 0) {
btrfs_warn(fs_info,
"swapfile must not be copy-on-write");
ret = -EINVAL;
goto out;
}
- em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
goto out;
}
- if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info,
"swapfile must have single data profile");
ret = -EINVAL;
@@ -10772,23 +10138,22 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
if (device == NULL) {
- device = em->map_lookup->stripes[0].dev;
+ device = map->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false);
if (ret == 1)
ret = 0;
else if (ret)
goto out;
- } else if (device != em->map_lookup->stripes[0].dev) {
+ } else if (device != map->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL;
goto out;
}
- physical_block_start = (em->map_lookup->stripes[0].physical +
- (logical_block_start - em->start));
- len = min(len, em->len - (logical_block_start - em->start));
- free_extent_map(em);
- em = NULL;
+ physical_block_start = (map->stripes[0].physical +
+ (logical_block_start - map->start));
+ btrfs_free_chunk_map(map);
+ map = NULL;
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
if (!bg) {
@@ -10827,20 +10192,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret)
goto out;
}
- bsi.start = start;
+ bsi.start = key.offset;
bsi.block_start = physical_block_start;
bsi.block_len = len;
}
- start += len;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ cond_resched();
}
if (bsi.block_len)
ret = btrfs_add_swap_extent(sis, &bsi);
out:
- if (!IS_ERR_OR_NULL(em))
- free_extent_map(em);
+ if (!IS_ERR_OR_NULL(map))
+ btrfs_free_chunk_map(map);
unlock_extent(io_tree, 0, isize - 1, &cached_state);
@@ -10851,6 +10221,10 @@ out:
btrfs_exclop_finish(fs_info);
+out_unlock_mmap:
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_free_backref_share_ctx(backref_ctx);
+ btrfs_free_path(path);
if (ret)
return ret;
@@ -10921,7 +10295,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
if (ordered) {
btrfs_err(root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
- start, end, btrfs_ino(inode), root->root_key.objectid,
+ start, end, btrfs_ino(inode), btrfs_root_id(root),
ordered->file_offset,
ordered->file_offset + ordered->num_bytes - 1);
btrfs_put_ordered_extent(ordered);
@@ -10930,6 +10304,36 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
ASSERT(ordered == NULL);
}
+/*
+ * Find the first inode with a minimum number.
+ *
+ * @root: The root to search for.
+ * @min_ino: The minimum inode number.
+ *
+ * Find the first inode in the @root with a number >= @min_ino and return it.
+ * Returns NULL if no such inode found.
+ */
+struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
+{
+ struct btrfs_inode *inode;
+ unsigned long from = min_ino;
+
+ xa_lock(&root->inodes);
+ while (true) {
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+ if (igrab(&inode->vfs_inode))
+ break;
+
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
+ }
+ xa_unlock(&root->inodes);
+
+ return inode;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -10982,10 +10386,11 @@ static const struct address_space_operations btrfs_aops = {
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.invalidate_folio = btrfs_invalidate_folio,
+ .launder_folio = btrfs_launder_folio,
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ae6806bc3929..1706f6d9b12e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -34,11 +34,9 @@
#include "export.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
#include "backref.h"
-#include "rcu-string.h"
#include "send.h"
#include "dev-replace.h"
#include "props.h"
@@ -47,9 +45,7 @@
#include "tree-log.h"
#include "compression.h"
#include "space-info.h"
-#include "delalloc-space.h"
#include "block-group.h"
-#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -231,6 +227,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
return 0;
}
+int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args)
+{
+ if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL)
+ return -ENAMETOOLONG;
+ return 0;
+}
+
+static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2)
+{
+ if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL)
+ return -ENAMETOOLONG;
+ return 0;
+}
+
/*
* Set flags/xflags from the internal inode flags. The remaining items of
* fsxattr are zeroed.
@@ -247,7 +257,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa)
{
struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
@@ -365,15 +375,15 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
return PTR_ERR(trans);
if (comp) {
- ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
- strlen(comp), 0);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ comp, strlen(comp), 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
- ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
- 0, 0);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ NULL, 0, 0);
if (ret && ret != -ENODATA) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -385,7 +395,7 @@ update_flags:
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
out_end_trans:
btrfs_end_transaction(trans);
@@ -540,7 +550,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return ret;
}
-int __pure btrfs_is_empty_uuid(u8 *uuid)
+int __pure btrfs_is_empty_uuid(const u8 *uuid)
{
int i;
@@ -582,7 +592,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *dentry,
struct btrfs_qgroup_inherit *inherit)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_root_item *root_item;
@@ -646,22 +656,17 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
ret = PTR_ERR(trans);
goto out_release_rsv;
}
- ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
- if (ret)
- goto out;
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
- /* Tree log can't currently deal with an inode which is a new root. */
- btrfs_set_log_full_commit(trans);
- ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit);
if (ret)
goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
- BTRFS_NESTING_NORMAL);
+ 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto out;
@@ -753,6 +758,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -786,7 +793,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct dentry *dentry, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
unsigned int trans_num_items;
@@ -846,7 +853,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
- pending_snapshot->dir = dir;
+ pending_snapshot->dir = BTRFS_I(dir);
pending_snapshot->inherit = inherit;
trans = btrfs_start_transaction(root, 0);
@@ -931,7 +938,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
if (d_really_is_negative(victim))
return -ENOENT;
- BUG_ON(d_inode(victim->d_parent) != dir);
+ /* The @victim is not inside @dir. */
+ if (d_inode(victim->d_parent) != dir)
+ return -EINVAL;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
@@ -983,7 +992,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = d_inode(parent->dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct dentry *dentry;
struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
int error;
@@ -1060,7 +1069,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
atomic_inc(&root->snapshot_force_cow);
snapshot_force_cow = true;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
ret = btrfs_mksubvol(parent, idmap, name, namelen,
root, readonly, inherit);
@@ -1118,7 +1127,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 new_size;
u64 old_size;
u64 devid = 1;
@@ -1149,7 +1158,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = PTR_ERR(vol_args);
goto out_drop;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
sizestr = vol_args->name;
cancel = (strcmp("cancel", sizestr) == 0);
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
@@ -1298,12 +1310,12 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
- if (!src.file) {
+ if (!fd_file(src)) {
ret = -EINVAL;
goto out_drop_write;
}
- src_inode = file_inode(src.file);
+ src_inode = file_inode(fd_file(src));
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
@@ -1349,12 +1361,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out;
ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
vol_args->name, vol_args->fd, subvol,
false, NULL);
+out:
kfree(vol_args);
return ret;
}
@@ -1373,7 +1388,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
+ if (ret < 0)
+ goto free_args;
if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
ret = -EOPNOTSUPP;
@@ -1383,7 +1400,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- u64 nums;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
if (vol_args->size < sizeof(*inherit) ||
vol_args->size > PAGE_SIZE) {
@@ -1396,19 +1413,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
goto free_args;
}
- if (inherit->num_qgroups > PAGE_SIZE ||
- inherit->num_ref_copies > PAGE_SIZE ||
- inherit->num_excl_copies > PAGE_SIZE) {
- ret = -EINVAL;
- goto free_inherit;
- }
-
- nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
- 2 * inherit->num_excl_copies;
- if (vol_args->size != struct_size(inherit, qgroups, nums)) {
- ret = -EINVAL;
+ ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size);
+ if (ret < 0)
goto free_inherit;
- }
}
ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
@@ -1426,7 +1433,7 @@ free_args:
static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
void __user *arg)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
u64 flags = 0;
@@ -1449,7 +1456,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 root_flags;
@@ -1502,7 +1509,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
spin_unlock(&root->root_item_lock);
btrfs_warn(fs_info,
"Attempt to set subvolume %llu read-write during send",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EPERM;
goto out_drop_sem;
}
@@ -1696,7 +1703,7 @@ static noinline int search_ioctl(struct inode *inode,
u64 *buf_size,
char __user *ubuf)
{
- struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *info = inode_to_fs_info(inode);
struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
@@ -1909,9 +1916,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_ioctl_ino_lookup_user_args *args)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct super_block *sb = inode->i_sb;
- struct btrfs_key upper_limit = BTRFS_I(inode)->location;
- u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+ u64 upper_limit = btrfs_ino(BTRFS_I(inode));
+ u64 treeid = btrfs_root_id(BTRFS_I(inode)->root);
u64 dirid = args->dirid;
unsigned long item_off;
unsigned long item_len;
@@ -1936,7 +1942,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
* If the bottom subvolume does not exist directly under upper_limit,
* construct the path in from the bottom up.
*/
- if (dirid != upper_limit.objectid) {
+ if (dirid != upper_limit) {
ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
root = btrfs_get_fs_root(fs_info, treeid, true);
@@ -1998,7 +2004,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
* btree and lock the same leaf.
*/
btrfs_release_path(path);
- temp_inode = btrfs_iget(sb, key2.objectid, root);
+ temp_inode = btrfs_iget(key2.objectid, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
goto out_put;
@@ -2011,7 +2017,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
goto out_put;
}
- if (key.offset == upper_limit.objectid)
+ if (key.offset == upper_limit)
break;
if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
ret = -EACCES;
@@ -2083,7 +2089,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
* path is reset so it's consistent with btrfs_search_path_in_tree.
*/
if (args->treeid == 0)
- args->treeid = root->root_key.objectid;
+ args->treeid = btrfs_root_id(root);
if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
args->name[0] = 0;
@@ -2132,7 +2138,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
inode = file_inode(file);
if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
- BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
/*
* The subvolume does not exist under fd with which this is
* called
@@ -2179,7 +2185,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
fs_info = BTRFS_I(inode)->root->fs_info;
/* Get root_item of inode's subvolume */
- key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
root = btrfs_get_fs_root(fs_info, key.objectid, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
@@ -2294,7 +2300,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
return PTR_ERR(rootrefs);
}
- objectid = root->root_key.objectid;
+ objectid = btrfs_root_id(root);
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
@@ -2367,9 +2373,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
bool destroy_v2)
{
struct dentry *parent = file->f_path.dentry;
- struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
struct dentry *dentry;
struct inode *dir = d_inode(parent);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
@@ -2378,7 +2384,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
int subvol_namelen;
- int err = 0;
+ int ret = 0;
bool destroy_parent = false;
/* We don't support snapshots with extent tree v2 yet. */
@@ -2394,7 +2400,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
return PTR_ERR(vol_args2);
if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
- err = -EOPNOTSUPP;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -2403,29 +2409,31 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* name, same as v1 currently does.
*/
if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
- vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2);
+ if (ret < 0)
+ goto out;
subvol_name = vol_args2->name;
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
} else {
struct inode *old_dir;
if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
dentry = btrfs_get_dentry(fs_info->sb,
BTRFS_FIRST_FREE_OBJECTID,
vol_args2->subvolid, 0);
if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
+ ret = PTR_ERR(dentry);
goto out_drop_write;
}
@@ -2445,7 +2453,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
dput(dentry);
if (IS_ERR(parent)) {
- err = PTR_ERR(parent);
+ ret = PTR_ERR(parent);
goto out_drop_write;
}
old_dir = dir;
@@ -2469,14 +2477,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* to delete without an idmapped mount.
*/
if (old_dir != dir && idmap != &nop_mnt_idmap) {
- err = -EOPNOTSUPP;
+ ret = -EOPNOTSUPP;
goto free_parent;
}
subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
fs_info, vol_args2->subvolid);
if (IS_ERR(subvol_name_ptr)) {
- err = PTR_ERR(subvol_name_ptr);
+ ret = PTR_ERR(subvol_name_ptr);
goto free_parent;
}
/* subvol_name_ptr is already nul terminated */
@@ -2487,11 +2495,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out;
+
subvol_name = vol_args->name;
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
}
@@ -2499,26 +2510,26 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (strchr(subvol_name, '/') ||
strncmp(subvol_name, "..", subvol_namelen) == 0) {
- err = -EINVAL;
+ ret = -EINVAL;
goto free_subvol_name;
}
if (!S_ISDIR(dir->i_mode)) {
- err = -ENOTDIR;
+ ret = -ENOTDIR;
goto free_subvol_name;
}
- err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (err == -EINTR)
+ ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (ret == -EINTR)
goto free_subvol_name;
dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen);
if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
+ ret = PTR_ERR(dentry);
goto out_unlock_dir;
}
if (d_really_is_negative(dentry)) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out_dput;
}
@@ -2538,7 +2549,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* Users who want to delete empty subvols should try
* rmdir(2).
*/
- err = -EPERM;
+ ret = -EPERM;
if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
goto out_dput;
@@ -2549,29 +2560,29 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* of the subvol, not a random directory contained
* within it.
*/
- err = -EINVAL;
+ ret = -EINVAL;
if (root == dest)
goto out_dput;
- err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
- if (err)
+ ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
+ if (ret)
goto out_dput;
}
/* check if subvolume may be deleted by a user */
- err = btrfs_may_delete(idmap, dir, dentry, 1);
- if (err)
+ ret = btrfs_may_delete(idmap, dir, dentry, 1);
+ if (ret)
goto out_dput;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out_dput;
}
btrfs_inode_lock(BTRFS_I(inode), 0);
- err = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
btrfs_inode_unlock(BTRFS_I(inode), 0);
- if (!err)
+ if (!ret)
d_delete_notify(dir, dentry);
out_dput:
@@ -2588,7 +2599,7 @@ out_drop_write:
out:
kfree(vol_args2);
kfree(vol_args);
- return err;
+ return ret;
}
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
@@ -2672,6 +2683,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
return -EINVAL;
}
+ if (fs_info->fs_devices->temp_fsid) {
+ btrfs_err(fs_info,
+ "device add not supported on cloned temp-fsid mount");
+ return -EINVAL;
+ }
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -2692,12 +2709,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
goto out;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
ret = btrfs_init_new_device(fs_info, vol_args->name);
if (!ret)
btrfs_info(fs_info, "disk added %s", vol_args->name);
+out_free:
kfree(vol_args);
out:
if (restore_op)
@@ -2711,10 +2732,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct block_device *bdev = NULL;
- void *holder;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2730,7 +2750,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
+ if (ret < 0)
+ goto out;
+
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
args.devid = vol_args->devid;
} else if (!strcmp("cancel", vol_args->name)) {
@@ -2751,7 +2774,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
btrfs_exclop_finish(fs_info);
@@ -2765,8 +2788,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2777,10 +2800,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_vol_args *vol_args;
- struct block_device *bdev = NULL;
- void *holder;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2791,7 +2813,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
if (!strcmp("cancel", vol_args->name)) {
cancel = true;
} else {
@@ -2807,17 +2832,18 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
+out_free:
kfree(vol_args);
return ret;
}
@@ -2859,7 +2885,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
}
if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
- fi_args->generation = fs_info->generation;
+ fi_args->generation = btrfs_get_fs_generation(fs_info);
fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
}
@@ -2921,7 +2947,7 @@ out:
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
@@ -2953,7 +2979,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(new_root->root_key.objectid)) {
+ if (!is_fstree(btrfs_root_id(new_root))) {
ret = -ENOENT;
goto out_free;
}
@@ -3168,7 +3194,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
return PTR_ERR(trans);
/* No running transaction, don't bother */
- transid = root->fs_info->last_trans_committed;
+ transid = btrfs_get_last_trans_committed(root->fs_info);
goto out;
}
transid = trans->transid;
@@ -3195,7 +3221,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
struct btrfs_ioctl_scrub_args *sa;
int ret;
@@ -3713,7 +3739,7 @@ out:
static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_quota_ctl_args *sa;
int ret;
@@ -3730,14 +3756,43 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
goto drop_write;
}
- down_write(&fs_info->subvol_sem);
-
switch (sa->cmd) {
case BTRFS_QUOTA_CTL_ENABLE:
- ret = btrfs_quota_enable(fs_info);
+ case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA:
+ down_write(&fs_info->subvol_sem);
+ ret = btrfs_quota_enable(fs_info, sa);
+ up_write(&fs_info->subvol_sem);
break;
case BTRFS_QUOTA_CTL_DISABLE:
+ /*
+ * Lock the cleaner mutex to prevent races with concurrent
+ * relocation, because relocation may be building backrefs for
+ * blocks of the quota root while we are deleting the root. This
+ * is like dropping fs roots of deleted snapshots/subvolumes, we
+ * need the same protection.
+ *
+ * This also prevents races between concurrent tasks trying to
+ * disable quotas, because we will unlock and relock
+ * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ *
+ * We take this here because we have the dependency of
+ *
+ * inode_lock -> subvol_sem
+ *
+ * because of rename. With relocation we can prealloc extents,
+ * so that makes the dependency chain
+ *
+ * cleaner_mutex -> inode_lock -> subvol_sem
+ *
+ * so we must take the cleaner_mutex here before we take the
+ * subvol_sem. The deadlock can't actually happen, but this
+ * quiets lockdep.
+ */
+ mutex_lock(&fs_info->cleaner_mutex);
+ down_write(&fs_info->subvol_sem);
ret = btrfs_quota_disable(fs_info);
+ up_write(&fs_info->subvol_sem);
+ mutex_unlock(&fs_info->cleaner_mutex);
break;
default:
ret = -EINVAL;
@@ -3745,18 +3800,34 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
}
kfree(sa);
- up_write(&fs_info->subvol_sem);
drop_write:
mnt_drop_write_file(file);
return ret;
}
+/*
+ * Quick check for ioctl handlers if quotas are enabled. Proper locking must be
+ * done before any operations.
+ */
+static bool qgroup_enabled(struct btrfs_fs_info *fs_info)
+{
+ bool ret = true;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root)
+ ret = false;
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ return ret;
+}
+
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_assign_args *sa;
+ struct btrfs_qgroup_list *prealloc = NULL;
struct btrfs_trans_handle *trans;
int ret;
int err;
@@ -3764,6 +3835,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3774,14 +3848,27 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
goto drop_write;
}
+ if (sa->assign) {
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto drop_write;
+ }
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
+ /*
+ * Prealloc ownership is moved to the relation handler, there it's used
+ * or freed on error.
+ */
if (sa->assign) {
- ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
+ ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst, prealloc);
+ prealloc = NULL;
} else {
ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
}
@@ -3791,13 +3878,15 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
err = btrfs_run_qgroups(trans);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (err < 0)
- btrfs_handle_fs_error(fs_info, err,
- "failed to update qgroup status and info");
+ btrfs_warn(fs_info,
+ "qgroup status update failed after %s relation, marked as inconsistent",
+ sa->assign ? "adding" : "deleting");
err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
out:
+ kfree(prealloc);
kfree(sa);
drop_write:
mnt_drop_write_file(file);
@@ -3816,6 +3905,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3872,6 +3964,9 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3891,7 +3986,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
qgroupid = sa->qgroupid;
if (!qgroupid) {
/* take the current subvol as qgroup */
- qgroupid = root->root_key.objectid;
+ qgroupid = btrfs_root_id(root);
}
ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
@@ -3910,13 +4005,16 @@ drop_write:
static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3974,7 +4072,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
struct btrfs_ioctl_received_subvol_args *sa)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
struct btrfs_trans_handle *trans;
@@ -4022,7 +4120,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
!btrfs_is_empty_uuid(root_item->received_uuid)) {
ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- root->root_key.objectid);
+ btrfs_root_id(root));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4046,7 +4144,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- root->root_key.objectid);
+ btrfs_root_id(root));
if (ret < 0 && ret != -EEXIST) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4162,7 +4260,7 @@ static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_trans_handle *trans;
@@ -4305,7 +4403,7 @@ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \
static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags flags[2];
@@ -4373,7 +4471,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4416,12 +4514,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
flags);
size_t copy_end;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
loff_t pos;
struct kiocb kiocb;
ssize_t ret;
+ u64 disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4474,7 +4577,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos;
- ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ &disk_bytenr, &disk_io_size);
+
+ if (ret == -EIOCBQUEUED) {
+ bool unlocked = false;
+ u64 start, lockend, count;
+
+ start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ if (args.compression)
+ count = disk_io_size;
+ else
+ count = args.len;
+
+ ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression, &unlocked);
+
+ if (!unlocked) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ }
+
if (ret >= 0) {
fsnotify_access(file);
if (copy_to_user(argp + copy_end,
@@ -4561,29 +4689,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (ret < 0)
goto out_acct;
- file_start_write(file);
-
if (iov_iter_count(&iter) == 0) {
ret = 0;
- goto out_end_write;
+ goto out_iov;
}
pos = args.offset;
ret = rw_verify_area(WRITE, file, &pos, args.len);
if (ret < 0)
- goto out_end_write;
+ goto out_iov;
init_sync_kiocb(&kiocb, file);
- ret = kiocb_set_rw_flags(&kiocb, 0);
+ ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
if (ret)
- goto out_end_write;
+ goto out_iov;
kiocb.ki_pos = pos;
+ file_start_write(file);
+
ret = btrfs_do_write_iter(&kiocb, &iter, &args);
if (ret > 0)
fsnotify_modify(file);
-out_end_write:
file_end_write(file);
+out_iov:
kfree(iov);
out_acct:
if (ret > 0)
@@ -4596,7 +4724,7 @@ long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
void __user *argp = (void __user *)arg;
@@ -4665,11 +4793,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
/*
- * The transaction thread may want to do more work,
- * namely it pokes the cleaner kthread that will start
- * processing uncleaned subvols.
+ * There may be work for the cleaner kthread to do (subvolume
+ * deletion, delayed iputs, defrag inodes, etc), so wake it up.
*/
- wake_up_process(fs_info->transaction_kthread);
+ wake_up_process(fs_info->cleaner_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
@@ -4695,10 +4822,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(inode, argp, false);
+ return _btrfs_ioctl_send(BTRFS_I(inode), argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(inode, argp, true);
+ return _btrfs_ioctl_send(BTRFS_I(inode), argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index d51b9a2f2f6e..19cd26b0244a 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -3,6 +3,15 @@
#ifndef BTRFS_IOCTL_H
#define BTRFS_IOCTL_H
+#include <linux/types.h>
+
+struct file;
+struct dentry;
+struct mnt_idmap;
+struct fileattr;
+struct btrfs_fs_info;
+struct btrfs_ioctl_balance_args;
+
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
@@ -10,7 +19,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int __pure btrfs_is_empty_uuid(u8 *uuid);
+int __pure btrfs_is_empty_uuid(const u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 7979449a58d6..6a0b7abb5bd9 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -8,11 +8,11 @@
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
+#include <trace/events/btrfs.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
-#include "accessors.h"
/*
* Lockdep class keys for extent_buffer->lock's in this root. For a given
@@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
+ { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
{ .id = 0, DEFINE_NAME("tree") },
};
@@ -83,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int
{
struct btrfs_lockdep_keyset *ks;
- BUG_ON(level >= ARRAY_SIZE(ks->keys));
+ ASSERT(level < ARRAY_SIZE(ks->keys));
/* Find the matching keyset, id 0 is the default entry */
for (ks = btrfs_lockdep_keysets; ks->id; ks++)
@@ -96,12 +97,21 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int
void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
- btrfs_set_buffer_lockdep_class(root->root_key.objectid,
+ btrfs_set_buffer_lockdep_class(btrfs_root_id(root),
eb, btrfs_header_level(eb));
}
#endif
+#ifdef CONFIG_BTRFS_DEBUG
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
+{
+ eb->lock_owner = owner;
+}
+#else
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
+#endif
+
/*
* Extent buffer locking
* =====================
@@ -119,14 +129,14 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff
*/
/*
- * __btrfs_tree_read_lock - lock extent buffer for read
+ * btrfs_tree_read_lock_nested - lock extent buffer for read
* @eb: the eb to be locked
* @nest: the nesting level to be used for lockdep
*
* This takes the read lock on the extent buffer, using the specified nesting
* level for lockdep purposes.
*/
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
u64 start_ns = 0;
@@ -137,11 +147,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
trace_btrfs_tree_read_lock(eb, start_ns);
}
-void btrfs_tree_read_lock(struct extent_buffer *eb)
-{
- __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
-}
-
/*
* Try-lock for read.
*
@@ -164,7 +169,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
if (down_write_trylock(&eb->lock)) {
- eb->lock_owner = current->pid;
+ btrfs_set_eb_lock_owner(eb, current->pid);
trace_btrfs_try_tree_write_lock(eb);
return 1;
}
@@ -181,13 +186,14 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
}
/*
- * __btrfs_tree_lock - lock eb for write
+ * Lock eb for write.
+ *
* @eb: the eb to lock
* @nest: the nesting to use for the lock
*
* Returns with the eb->lock write locked.
*/
-void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -196,22 +202,17 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
start_ns = ktime_get_ns();
down_write_nested(&eb->lock, nest);
- eb->lock_owner = current->pid;
+ btrfs_set_eb_lock_owner(eb, current->pid);
trace_btrfs_tree_lock(eb, start_ns);
}
-void btrfs_tree_lock(struct extent_buffer *eb)
-{
- __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
-}
-
/*
* Release the write lock.
*/
void btrfs_tree_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_unlock(eb);
- eb->lock_owner = 0;
+ btrfs_set_eb_lock_owner(eb, 0);
up_write(&eb->lock);
}
@@ -363,8 +364,12 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
- atomic_dec(&lock->writers);
- cond_wake_up(&lock->pending_readers);
+ /*
+ * atomic_dec_and_test() implies a full barrier, so woken up readers are
+ * guaranteed to see the decrement.
+ */
+ if (atomic_dec_and_test(&lock->writers))
+ wake_up(&lock->pending_readers);
}
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 7d6ee1e609bf..3c15c75e0582 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -8,9 +8,14 @@
#include <linux/atomic.h>
#include <linux/wait.h>
+#include <linux/lockdep.h>
#include <linux/percpu_counter.h>
#include "extent_io.h"
+struct extent_buffer;
+struct btrfs_path;
+struct btrfs_root;
+
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
@@ -157,14 +162,22 @@ enum btrfs_lockdep_trans_states {
static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
"too many lock subclasses defined");
-struct btrfs_path;
+void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
+
+static inline void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ btrfs_tree_lock_nested(eb, BTRFS_NESTING_NORMAL);
+}
-void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
-void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
-void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
+
+static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ btrfs_tree_read_lock_nested(eb, BTRFS_NESTING_NORMAL);
+}
+
void btrfs_tree_read_unlock(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c
index 0fe0ae54ac67..fd88af17d8d9 100644
--- a/fs/btrfs/lru_cache.c
+++ b/fs/btrfs/lru_cache.c
@@ -9,7 +9,7 @@
*
* @cache: The cache.
* @max_size: Maximum size (number of entries) for the cache.
- * Use 0 for unlimited size, it's the user's responsability to
+ * Use 0 for unlimited size, it's the user's responsibility to
* trim the cache in that case.
*/
void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size)
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
index 00328c856be6..07f1bb1c6aa3 100644
--- a/fs/btrfs/lru_cache.h
+++ b/fs/btrfs/lru_cache.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_LRU_CACHE_H
#define BTRFS_LRU_CACHE_H
+#include <linux/types.h>
#include <linux/maple_tree.h>
#include <linux/list.h>
@@ -50,11 +51,6 @@ struct btrfs_lru_cache {
#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \
list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list)
-static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache)
-{
- return cache->size;
-}
-
static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry(
struct btrfs_lru_cache *cache)
{
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d3fcfc628a4f..72856f6775f7 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -130,17 +130,17 @@ static inline size_t read_compress_length(const char *buf)
*/
static int copy_compressed_data_to_page(char *compressed_data,
size_t compressed_size,
- struct page **out_pages,
- unsigned long max_nr_page,
+ struct folio **out_folios,
+ unsigned long max_nr_folio,
u32 *cur_out,
const u32 sectorsize)
{
u32 sector_bytes_left;
u32 orig_out;
- struct page *cur_page;
+ struct folio *cur_folio;
char *kaddr;
- if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
return -E2BIG;
/*
@@ -149,16 +149,16 @@ static int copy_compressed_data_to_page(char *compressed_data,
*/
ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
- cur_page = out_pages[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out / PAGE_SIZE];
/* Allocate a new page */
- if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
- if (!cur_page)
+ if (!cur_folio) {
+ cur_folio = btrfs_alloc_compr_folio();
+ if (!cur_folio)
return -ENOMEM;
- out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ out_folios[*cur_out / PAGE_SIZE] = cur_folio;
}
- kaddr = kmap_local_page(cur_page);
+ kaddr = kmap_local_folio(cur_folio, 0);
write_compress_length(kaddr + offset_in_page(*cur_out),
compressed_size);
*cur_out += LZO_LEN;
@@ -172,18 +172,18 @@ static int copy_compressed_data_to_page(char *compressed_data,
kunmap_local(kaddr);
- if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
return -E2BIG;
- cur_page = out_pages[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out / PAGE_SIZE];
/* Allocate a new page */
- if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
- if (!cur_page)
+ if (!cur_folio) {
+ cur_folio = btrfs_alloc_compr_folio();
+ if (!cur_folio)
return -ENOMEM;
- out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ out_folios[*cur_out / PAGE_SIZE] = cur_folio;
}
- kaddr = kmap_local_page(cur_page);
+ kaddr = kmap_local_folio(cur_folio, 0);
memcpy(kaddr + offset_in_page(*cur_out),
compressed_data + *cur_out - orig_out, copy_len);
@@ -209,15 +209,15 @@ out:
return 0;
}
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
- struct page *page_in = NULL;
+ const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize;
+ struct folio *folio_in = NULL;
char *sizes_ptr;
- const unsigned long max_nr_page = *out_pages;
+ const unsigned long max_nr_folio = *out_folios;
int ret = 0;
/* Points to the file offset of input data */
u64 cur_in = start;
@@ -225,8 +225,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u32 cur_out = 0;
u32 len = *total_out;
- ASSERT(max_nr_page > 0);
- *out_pages = 0;
+ ASSERT(max_nr_folio > 0);
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
@@ -243,28 +243,29 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
size_t out_len;
/* Get the input page first */
- if (!page_in) {
- page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
- ASSERT(page_in);
+ if (!folio_in) {
+ ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in);
+ if (ret < 0)
+ goto out;
}
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap_local_page(page_in);
+ data_in = kmap_local_folio(folio_in, 0);
ret = lzo1x_1_compress(data_in +
offset_in_page(cur_in), in_len,
workspace->cbuf, &out_len,
workspace->mem);
kunmap_local(data_in);
- if (ret < 0) {
- pr_debug("BTRFS: lzo in loop returned %d\n", ret);
+ if (unlikely(ret < 0)) {
+ /* lzo1x_1_compress never fails. */
ret = -EIO;
goto out;
}
ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
- pages, max_nr_page,
+ folios, max_nr_folio,
&cur_out, sectorsize);
if (ret < 0)
goto out;
@@ -282,13 +283,13 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we have reached page boundary */
if (PAGE_ALIGNED(cur_in)) {
- put_page(page_in);
- page_in = NULL;
+ folio_put(folio_in);
+ folio_in = NULL;
}
}
/* Store the size of all chunks of compressed data */
- sizes_ptr = kmap_local_page(pages[0]);
+ sizes_ptr = kmap_local_folio(folios[0], 0);
write_compress_length(sizes_ptr, cur_out);
kunmap_local(sizes_ptr);
@@ -296,9 +297,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = cur_out;
*total_in = cur_in - start;
out:
- if (page_in)
- put_page(page_in);
- *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ if (folio_in)
+ folio_put(folio_in);
+ *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
@@ -313,15 +314,15 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- struct page *cur_page;
+ struct folio *cur_folio;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
ASSERT(copy_len);
- cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+ cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE];
- memcpy_from_page(dest + *cur_in - orig_in, cur_page,
- offset_in_page(*cur_in), copy_len);
+ memcpy_from_folio(dest + *cur_in - orig_in, cur_folio,
+ offset_in_folio(cur_folio, *cur_in), copy_len);
*cur_in += copy_len;
}
@@ -341,7 +342,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- kaddr = kmap_local_page(cb->compressed_pages[0]);
+ kaddr = kmap_local_folio(cb->compressed_folios[0], 0);
len_in = read_compress_length(kaddr);
kunmap_local(kaddr);
cur_in += LZO_LEN;
@@ -353,17 +354,20 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
* and all sectors should be used.
* If this happens, it means the compressed extent is corrupted.
*/
- if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
- round_up(len_in, sectorsize) < cb->compressed_len) {
+ if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+ round_up(len_in, sectorsize) < cb->compressed_len)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
btrfs_err(fs_info,
- "invalid lzo header, lzo len %u compressed len %u",
- len_in, cb->compressed_len);
+"lzo header invalid, root %llu inode %llu offset %llu lzo len %u compressed len %u",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start, len_in, cb->compressed_len);
return -EUCLEAN;
}
/* Go through each lzo segment */
while (cur_in < len_in) {
- struct page *cur_page;
+ struct folio *cur_folio;
/* Length of the compressed segment */
u32 seg_len;
u32 sector_bytes_left;
@@ -375,20 +379,24 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
- cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
- ASSERT(cur_page);
- kaddr = kmap_local_page(cur_page);
+ cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE];
+ ASSERT(cur_folio);
+ kaddr = kmap_local_folio(cur_folio, 0);
seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
kunmap_local(kaddr);
cur_in += LZO_LEN;
- if (seg_len > WORKSPACE_CBUF_LENGTH) {
+ if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
/*
* seg_len shouldn't be larger than we have allocated
* for workspace->cbuf
*/
- btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
- seg_len);
+ btrfs_err(fs_info,
+ "lzo segment too big, root %llu inode %llu offset %llu len %u",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start, seg_len);
return -EIO;
}
@@ -398,8 +406,13 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Decompress the data */
ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
workspace->buf, &out_len);
- if (ret != LZO_E_OK) {
- btrfs_err(fs_info, "failed to decompress");
+ if (unlikely(ret != LZO_E_OK)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(fs_info,
+ "lzo decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start);
return -EIO;
}
@@ -425,16 +438,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
+ const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
- char *kaddr;
- unsigned long bytes;
if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
return -EUCLEAN;
@@ -451,37 +464,26 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
}
data_in += LZO_LEN;
- out_len = PAGE_SIZE;
+ out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
- if (ret != LZO_E_OK) {
- pr_warn("BTRFS: decompress failed!\n");
+ if (unlikely(ret != LZO_E_OK)) {
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
+
+ btrfs_err(fs_info,
+ "lzo decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ folio_pos(dest_folio));
ret = -EIO;
goto out;
}
- if (out_len < start_byte) {
+ ASSERT(out_len <= sectorsize);
+ memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len);
+ /* Early end, considered as an error. */
+ if (unlikely(out_len < destlen)) {
ret = -EIO;
- goto out;
+ folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len);
}
-
- /*
- * the caller is already checking against PAGE_SIZE, but lets
- * move this check closer to the memcpy/memset
- */
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes = min_t(unsigned long, destlen, out_len - start_byte);
-
- kaddr = kmap_local_page(dest_page);
- memcpy(kaddr, workspace->buf + start_byte, bytes);
-
- /*
- * btrfs_getblock is doing a zero on the tail of the page too,
- * but this will cover anything missing from the decompressed
- * data.
- */
- if (bytes < destlen)
- memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 7695decc7243..363fd28c0268 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -3,13 +3,11 @@
#include "fs.h"
#include "messages.h"
#include "discard.h"
-#include "transaction.h"
-#include "space-info.h"
#include "super.h"
#ifdef CONFIG_PRINTK
-#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_PREFACE " state "
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1)
/*
@@ -22,7 +20,8 @@ static const char fs_state_chars[] = {
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
- [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C',
+ [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S',
[BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
};
@@ -72,11 +71,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
* over the error. Each subsequent error that doesn't have any context
* of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
*/
-const char * __attribute_const__ btrfs_decode_error(int errno)
+const char * __attribute_const__ btrfs_decode_error(int error)
{
char *errstr = "unknown";
- switch (errno) {
+ switch (error) {
case -ENOENT: /* -2 */
errstr = "No such entry";
break;
@@ -110,12 +109,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno)
}
/*
- * __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the appropriate error response.
+ * Decodes expected errors from the caller and invokes the appropriate error
+ * response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
@@ -132,11 +131,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Special case: if the error is EROFS, and we're already under
* SB_RDONLY, then it is safe here.
*/
- if (errno == -EROFS && sb_rdonly(sb))
+ if (error == -EROFS && sb_rdonly(sb))
return;
#ifdef CONFIG_PRINTK
- errstr = btrfs_decode_error(errno);
+ errstr = btrfs_decode_error(error);
btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
@@ -147,11 +146,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
vaf.va = &args;
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, statestr, function, line, errno, errstr, &vaf);
+ sb->s_id, statestr, function, line, error, errstr, &vaf);
va_end(args);
} else {
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
- sb->s_id, statestr, function, line, errno, errstr);
+ sb->s_id, statestr, function, line, error, errstr);
}
#endif
@@ -159,7 +158,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Today we only save the error info to memory. Long term we'll also
* send it down to the disk.
*/
- WRITE_ONCE(fs_info->fs_error, errno);
+ WRITE_ONCE(fs_info->fs_error, error);
/* Don't go through full error handling during mount. */
if (!(sb->s_flags & SB_BORN))
@@ -240,7 +239,8 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt,
vaf.fmt = fmt;
vaf.va = &args;
- if (__ratelimit(ratelimit)) {
+ /* Do not ratelimit if CONFIG_BTRFS_DEBUG is enabled. */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG) || __ratelimit(ratelimit)) {
if (fs_info) {
char statestr[STATE_STRING_BUF_LEN];
@@ -283,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
#endif
/*
- * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
- * alert, and either panics or BUGs, depending on mount options.
+ * Decode unexpected, fatal errors from the caller, issue an alert, and either
+ * panic or BUGs, depending on mount options.
*/
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int error, const char *fmt, ...)
{
char *s_id = "<unknown>";
const char *errstr;
@@ -301,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
va_start(args, fmt);
vaf.va = &args;
- errstr = btrfs_decode_error(errno);
+ errstr = btrfs_decode_error(error);
if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
- s_id, function, line, &vaf, errno, errstr);
+ s_id, function, line, &vaf, error, errstr);
btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
- function, line, &vaf, errno, errstr);
+ function, line, &vaf, error, errstr);
va_end(args);
/* Caller calls BUG() */
}
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 1ae6f8e23e07..08a9272399d2 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -184,25 +184,25 @@ do { \
__printf(5, 6)
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
+ unsigned int line, int error, const char *fmt, ...);
-const char * __attribute_const__ btrfs_decode_error(int errno);
+const char * __attribute_const__ btrfs_decode_error(int error);
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+#define btrfs_handle_fs_error(fs_info, error, fmt, args...) \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args)
+ (error), fmt, ##args)
__printf(5, 6)
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int error, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
* will panic(). Otherwise we BUG() here.
*/
-#define btrfs_panic(fs_info, errno, fmt, args...) \
+#define btrfs_panic(fs_info, error, fmt, args...) \
do { \
- __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ __btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args); \
BUG(); \
} while (0)
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 40f2d9f1a17a..0d599fd847c9 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_MISC_H
#define BTRFS_MISC_H
+#include <linux/types.h>
+#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/math64.h>
@@ -64,7 +66,7 @@ struct rb_simple_node {
u64 bytenr;
};
-static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
+static inline struct rb_node *rb_simple_search(const struct rb_root *root, u64 bytenr)
{
struct rb_node *node = root->rb_node;
struct rb_simple_node *entry;
@@ -91,7 +93,7 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
* Return the rb_node that start at or after @bytenr. If there is no entry at
* or after @bytner return NULL.
*/
-static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+static inline struct rb_node *rb_simple_search_first(const struct rb_root *root,
u64 bytenr)
{
struct rb_node *node = root->rb_node, *ret = NULL;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8a3c46cb67f5..880f9553d79d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -19,7 +19,7 @@
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
-#include "super.h"
+#include "block-group.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
* look find the first ordered struct that has this offset, otherwise
* the first one less than this offset
*/
-static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
- u64 file_offset)
+static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
+ u64 file_offset)
{
- struct rb_root *root = &tree->tree;
struct rb_node *prev = NULL;
struct rb_node *ret;
struct btrfs_ordered_extent *entry;
- if (tree->last) {
- entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+ if (inode->ordered_tree_last) {
+ entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
rb_node);
if (in_range(file_offset, entry->file_offset, entry->num_bytes))
- return tree->last;
+ return inode->ordered_tree_last;
}
- ret = __tree_search(root, file_offset, &prev);
+ ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
if (!ret)
ret = prev;
if (ret)
- tree->last = ret;
+ inode->ordered_tree_last = ret;
return ret;
}
@@ -154,9 +153,10 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
struct btrfs_ordered_extent *entry;
int ret;
u64 qgroup_rsv = 0;
+ const bool is_nocow = (flags &
+ ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
- if (flags &
- ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
+ if (is_nocow) {
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
if (ret < 0)
@@ -171,8 +171,13 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
return ERR_PTR(ret);
}
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
- if (!entry)
+ if (!entry) {
+ if (!is_nocow)
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ btrfs_root_id(inode->root),
+ qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(-ENOMEM);
+ }
entry->file_offset = file_offset;
entry->num_bytes = num_bytes;
@@ -181,7 +186,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->disk_num_bytes = disk_num_bytes;
entry->offset = offset;
entry->bytes_left = num_bytes;
- entry->inode = igrab(&inode->vfs_inode);
+ entry->inode = BTRFS_I(igrab(&inode->vfs_inode));
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = qgroup_rsv;
@@ -192,6 +197,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
+ INIT_LIST_HEAD(&entry->bioc_list);
init_completion(&entry->completion);
/*
@@ -208,8 +214,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
- struct btrfs_inode *inode = BTRFS_I(entry->inode);
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct btrfs_inode *inode = entry->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -222,13 +227,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
/* One ref for the tree. */
refcount_inc(&entry->refs);
- spin_lock_irq(&tree->lock);
- node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
- if (node)
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = tree_insert(&inode->ordered_tree, entry->file_offset,
+ &entry->rb_node);
+ if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -253,7 +259,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
* @disk_bytenr: Offset of extent on disk.
* @disk_num_bytes: Size of extent on disk.
* @offset: Offset into unencoded data where file data starts.
- * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*).
* @compress_type: Compression algorithm used for data.
*
* Most of these parameters correspond to &struct btrfs_file_extent_item. The
@@ -264,17 +270,39 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
*/
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type)
+ const struct btrfs_file_extent *file_extent, unsigned long flags)
{
struct btrfs_ordered_extent *entry;
ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
- entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
- disk_bytenr, disk_num_bytes, offset, flags,
- compress_type);
+ /*
+ * For regular writes, we just use the members in @file_extent.
+ *
+ * For NOCOW, we don't really care about the numbers except @start and
+ * file_extent->num_bytes, as we won't insert a file extent item at all.
+ *
+ * For PREALLOC, we do not use ordered extent members, but
+ * btrfs_mark_extent_written() handles everything.
+ *
+ * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents,
+ * or btrfs_split_ordered_extent() cannot handle it correctly.
+ */
+ if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)))
+ entry = alloc_ordered_extent(inode, file_offset,
+ file_extent->num_bytes,
+ file_extent->num_bytes,
+ file_extent->disk_bytenr + file_extent->offset,
+ file_extent->num_bytes, 0, flags,
+ file_extent->compression);
+ else
+ entry = alloc_ordered_extent(inode, file_offset,
+ file_extent->num_bytes,
+ file_extent->ram_bytes,
+ file_extent->disk_bytenr,
+ file_extent->disk_num_bytes,
+ file_extent->offset, flags,
+ file_extent->compression);
if (!IS_ERR(entry))
insert_ordered_extent(entry);
return entry;
@@ -288,12 +316,17 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_inode *inode = entry->inode;
- tree = &BTRFS_I(entry->inode)->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
+}
+
+void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
+{
+ if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
}
static void finish_ordered_fn(struct btrfs_work *work)
@@ -305,18 +338,18 @@ static void finish_ordered_fn(struct btrfs_work *work)
}
static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
- struct page *page, u64 file_offset,
+ struct folio *folio, u64 file_offset,
u64 len, bool uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- lockdep_assert_held(&inode->ordered_tree.lock);
+ lockdep_assert_held(&inode->ordered_tree_lock);
- if (page) {
- ASSERT(page->mapping);
- ASSERT(page_offset(page) <= file_offset);
- ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);
+ if (folio) {
+ ASSERT(folio->mapping);
+ ASSERT(folio_pos(folio) <= file_offset);
+ ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
/*
* Ordered (Private2) bit indicates whether we still have
@@ -324,16 +357,16 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
*
* If there's no such bit, we need to skip to next range.
*/
- if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
+ if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
return false;
- btrfs_page_clear_ordered(fs_info, page, file_offset, len);
+ btrfs_folio_clear_ordered(fs_info, folio, file_offset, len);
}
/* Now we're fine to update the accounting. */
if (WARN_ON_ONCE(len > ordered->bytes_left)) {
btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
- inode->root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
ordered->file_offset, ordered->num_bytes,
len, ordered->bytes_left);
ordered->bytes_left = 0;
@@ -360,39 +393,70 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
fs_info->endio_freespace_worker : fs_info->endio_write_workers;
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
btrfs_queue_work(wq, &ordered->work);
}
-bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
- struct page *page, u64 file_offset, u64 len,
+void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+ struct folio *folio, u64 file_offset, u64 len,
bool uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
unsigned long flags;
bool ret;
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
- spin_lock_irqsave(&inode->ordered_tree.lock, flags);
- ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
- spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
+ uptodate);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+
+ /*
+ * If this is a COW write it means we created new extent maps for the
+ * range and they point to unwritten locations if we got an error either
+ * before submitting a bio or during IO.
+ *
+ * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
+ * are queuing its completion below. During completion, at
+ * btrfs_finish_one_ordered(), we will drop the extent maps for the
+ * unwritten extents.
+ *
+ * However because completion runs in a work queue we can end up having
+ * a fast fsync running before that. In the case of direct IO, once we
+ * unlock the inode the fsync might start, and we queue the completion
+ * before unlocking the inode. In the case of buffered IO when writeback
+ * finishes (end_bbio_data_write()) we queue the completion, so if the
+ * writeback was triggered by a fast fsync, the fsync might start
+ * logging before ordered extent completion runs in the work queue.
+ *
+ * The fast fsync will log file extent items based on the extent maps it
+ * finds, so if by the time it collects extent maps the ordered extent
+ * completion didn't happen yet, it will log file extent items that
+ * point to unwritten extents, resulting in a corruption if a crash
+ * happens and the log tree is replayed. Note that a fast fsync does not
+ * wait for completion of ordered extents in order to reduce latency.
+ *
+ * Set a flag in the inode so that the next fast fsync will wait for
+ * ordered extents to complete before starting to log.
+ */
+ if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
if (ret)
btrfs_queue_ordered_fn(ordered);
- return ret;
}
/*
* Mark all ordered extents io inside the specified range finished.
*
- * @page: The involved page for the operation.
- * For uncompressed buffered IO, the page status also needs to be
+ * @folio: The involved folio for the operation.
+ * For uncompressed buffered IO, the folio status also needs to be
* updated to indicate whether the pending ordered io is finished.
* Can be NULL for direct IO and compressed write.
* For these cases, callers are ensured they won't execute the
@@ -402,10 +466,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
* extent(s) covering it.
*/
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
- struct page *page, u64 file_offset,
+ struct folio *folio, u64 file_offset,
u64 num_bytes, bool uptodate)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
@@ -415,13 +478,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
file_offset + num_bytes - 1,
uptodate);
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
while (cur < file_offset + num_bytes) {
u64 entry_end;
u64 end;
u32 len;
- node = tree_search(tree, cur);
+ node = ordered_tree_search(inode, cur);
/* No ordered extents at all */
if (!node)
break;
@@ -467,14 +530,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
ASSERT(end + 1 - cur < U32_MAX);
len = end + 1 - cur;
- if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
- spin_unlock_irqrestore(&tree->lock, flags);
+ if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
btrfs_queue_ordered_fn(entry);
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
}
cur += len;
}
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
}
/*
@@ -498,19 +561,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
bool finished = false;
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
if (cached && *cached) {
entry = *cached;
goto have_entry;
}
- node = tree_search(tree, file_offset);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -541,7 +603,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
return finished;
}
@@ -554,14 +616,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
+ trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
- btrfs_add_delayed_iput(BTRFS_I(entry->inode));
+ btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -579,7 +641,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = btrfs_inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -593,7 +654,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
- /* This is paired with btrfs_alloc_ordered_extent. */
+ /* This is paired with alloc_ordered_extent(). */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
@@ -612,16 +673,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
- tree = &btrfs_inode->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&btrfs_inode->ordered_tree_lock);
node = &entry->rb_node;
- rb_erase(node, &tree->tree);
+ rb_erase(node, &btrfs_inode->ordered_tree);
RB_CLEAR_NODE(node);
- if (tree->last == node)
- tree->last = NULL;
+ if (btrfs_inode->ordered_tree_last == node)
+ btrfs_inode->ordered_tree_last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
/*
* The current running transaction is waiting on us, we need to let it
@@ -680,11 +740,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
}
/*
- * wait for all the ordered extents in a root. This is done when balancing
- * space between drives.
+ * Wait for all the ordered extents in a root. Use @bg as range or do whole
+ * range if it's NULL.
*/
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
- const u64 range_start, const u64 range_len)
+ const struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(splice);
@@ -692,7 +752,17 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
LIST_HEAD(works);
struct btrfs_ordered_extent *ordered, *next;
u64 count = 0;
- const u64 range_end = range_start + range_len;
+ u64 range_start, range_len;
+ u64 range_end;
+
+ if (bg) {
+ range_start = bg->start;
+ range_len = bg->length;
+ } else {
+ range_start = 0;
+ range_len = U64_MAX;
+ }
+ range_end = range_start + range_len;
mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
@@ -714,15 +784,15 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
- btrfs_run_ordered_extent_work, NULL, NULL);
+ btrfs_run_ordered_extent_work, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
cond_resched();
- spin_lock(&root->ordered_extent_lock);
if (nr != U64_MAX)
nr--;
count++;
+ spin_lock(&root->ordered_extent_lock);
}
list_splice_tail(&skipped, &root->ordered_extents);
list_splice_tail(&splice, &root->ordered_extents);
@@ -739,8 +809,12 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
return count;
}
+/*
+ * Wait for @nr ordered extents that intersect the @bg, or the whole range of
+ * the filesystem if @bg is NULL.
+ */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
- const u64 range_start, const u64 range_len)
+ const struct btrfs_block_group *bg)
{
struct btrfs_root *root;
LIST_HEAD(splice);
@@ -758,14 +832,13 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
&fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
- done = btrfs_wait_ordered_extents(root, nr,
- range_start, range_len);
+ done = btrfs_wait_ordered_extents(root, nr, bg);
btrfs_put_root(root);
- spin_lock(&fs_info->ordered_root_lock);
- if (nr != U64_MAX) {
+ if (nr != U64_MAX)
nr -= done;
- }
+
+ spin_lock(&fs_info->ordered_root_lock);
}
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
@@ -782,7 +855,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
- struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ struct btrfs_inode *inode = entry->inode;
bool freespace_inode;
trace_btrfs_ordered_extent_start(inode, entry);
@@ -809,7 +882,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
/*
* Used to wait on ordered extents across a large range of bytes.
*/
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len)
{
int ret = 0;
int ret_wb = 0;
@@ -839,11 +912,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* before the ordered extents complete - to avoid failures (-EEXIST)
* when adding the new ordered extents to the ordered tree.
*/
- ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end);
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
+ ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -878,14 +951,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
- tree = &inode->ordered_tree;
- spin_lock_irqsave(&tree->lock, flags);
- node = tree_search(tree, file_offset);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -897,7 +968,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
trace_btrfs_ordered_extent_lookup(inode, entry);
}
out:
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
return entry;
}
@@ -907,15 +978,13 @@ out:
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
- node = tree_search(tree, file_offset);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = ordered_tree_search(inode, file_offset);
if (!node) {
- node = tree_search(tree, file_offset + len);
+ node = ordered_tree_search(inode, file_offset + len);
if (!node)
goto out;
}
@@ -939,7 +1008,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_range(inode, entry);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -950,13 +1019,12 @@ out:
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *n;
- ASSERT(inode_is_locked(&inode->vfs_inode));
+ btrfs_assert_inode_locked(inode);
- spin_lock_irq(&tree->lock);
- for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ spin_lock_irq(&inode->ordered_tree_lock);
+ for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
struct btrfs_ordered_extent *ordered;
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
@@ -969,7 +1037,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
refcount_inc(&ordered->refs);
trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
/*
@@ -979,13 +1047,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
- node = tree_search(tree, file_offset);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -993,7 +1059,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -1009,15 +1075,14 @@ out:
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct rb_node *cur;
struct rb_node *prev;
struct rb_node *next;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&tree->lock);
- node = tree->tree.rb_node;
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = inode->ordered_tree.rb_node;
/*
* Here we don't want to use tree_search() which will use tree->last
* and screw up the search order.
@@ -1071,7 +1136,7 @@ out:
trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -1149,8 +1214,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 file_offset = ordered->file_offset;
@@ -1171,6 +1235,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
*/
if (WARN_ON_ONCE(len >= ordered->num_bytes))
return ERR_PTR(-EINVAL);
+ /*
+ * If our ordered extent had an error there's no point in continuing.
+ * The error may have come from a transaction abort done either by this
+ * task or some other concurrent task, and the transaction abort path
+ * iterates over all existing ordered extents and sets the flag
+ * BTRFS_ORDERED_IOERR on them.
+ */
+ if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
+ const int fs_error = BTRFS_FS_ERROR(fs_info);
+
+ return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
+ }
/* We cannot split partially completed ordered extents. */
if (ordered->bytes_left) {
ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
@@ -1189,15 +1265,32 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
/* One ref for the tree. */
refcount_inc(&new->refs);
+ /*
+ * Take the root's ordered_extent_lock to avoid a race with
+ * btrfs_wait_ordered_extents() when updating the disk_bytenr and
+ * disk_num_bytes fields of the ordered extent below. And we disable
+ * IRQs because the inode's ordered_tree_lock is used in IRQ context
+ * elsewhere.
+ *
+ * There's no concern about a previous caller of
+ * btrfs_wait_ordered_extents() getting the trimmed ordered extent
+ * before we insert the new one, because even if it gets the ordered
+ * extent before it's trimmed and the new one inserted, right before it
+ * uses it or during its use, the ordered extent might have been
+ * trimmed in the meanwhile, and it missed the new ordered extent.
+ * There's no way around this and it's harmless for current use cases,
+ * so we take the root's ordered_extent_lock to fix that race during
+ * trimming and silence tools like KCSAN.
+ */
spin_lock_irq(&root->ordered_extent_lock);
- spin_lock(&tree->lock);
- /* Remove from tree once */
- node = &ordered->rb_node;
- rb_erase(node, &tree->tree);
- RB_CLEAR_NODE(node);
- if (tree->last == node)
- tree->last = NULL;
+ spin_lock(&inode->ordered_tree_lock);
+ /*
+ * We don't have overlapping ordered extents (that would imply double
+ * allocation of extents) and we checked above that the split length
+ * does not cross the ordered extent's num_bytes field, so there's
+ * no need to remove it and re-insert it in the tree.
+ */
ordered->file_offset += len;
ordered->disk_bytenr += len;
ordered->num_bytes -= len;
@@ -1227,19 +1320,12 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
offset += sum->len;
}
- /* Re-insert the node */
- node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
- if (node)
- btrfs_panic(fs_info, -EEXIST,
- "zoned: inconsistency in ordered tree at offset %llu",
- ordered->file_offset);
-
- node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
- if (node)
+ node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
+ if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
- "zoned: inconsistency in ordered tree at offset %llu",
+ "inconsistency in ordered tree at offset %llu after split",
new->file_offset);
- spin_unlock(&tree->lock);
+ spin_unlock(&inode->ordered_tree_lock);
list_add_tail(&new->root_extent_list, &root->ordered_extents);
root->nr_ordered_extents++;
@@ -1249,10 +1335,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
int __init ordered_data_init(void)
{
- btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
- sizeof(struct btrfs_ordered_extent), 0,
- SLAB_MEM_SPREAD,
- NULL);
+ btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);
if (!btrfs_ordered_extent_cache)
return -ENOMEM;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 173bd5c5df26..4e152736d06c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -6,12 +6,20 @@
#ifndef BTRFS_ORDERED_DATA_H
#define BTRFS_ORDERED_DATA_H
-/* one of these per inode */
-struct btrfs_ordered_inode_tree {
- spinlock_t lock;
- struct rb_root tree;
- struct rb_node *last;
-};
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+#include <linux/wait.h>
+#include "async-thread.h"
+
+struct inode;
+struct page;
+struct extent_state;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_fs_info;
struct btrfs_ordered_sum {
/*
@@ -104,13 +112,6 @@ struct btrfs_ordered_extent {
u64 bytes_left;
/*
- * the end of the ordered extent which is behind it but
- * didn't update disk_i_size. Please see the comment of
- * btrfs_ordered_update_i_size();
- */
- u64 outstanding_isize;
-
- /*
* If we get truncated we need to adjust the file extent we enter for
* this ordered extent so that we do not expose stale data.
*/
@@ -129,7 +130,7 @@ struct btrfs_ordered_extent {
refcount_t refs;
/* the inode we belong to */
- struct inode *inode;
+ struct btrfs_inode *inode;
/* list of checksums for insertion when the extent io is done */
struct list_head list;
@@ -151,15 +152,9 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
-};
-static inline void
-btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
-{
- spin_lock_init(&t->lock);
- t->tree = RB_ROOT;
- t->last = NULL;
-}
+ struct list_head bioc_list;
+};
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
@@ -167,26 +162,37 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
- struct page *page, u64 file_offset, u64 len,
+void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+ struct folio *folio, u64 file_offset, u64 len,
bool uptodate);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
- struct page *page, u64 file_offset,
- u64 num_bytes, bool uptodate);
+ struct folio *folio, u64 file_offset,
+ u64 num_bytes, bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
+
+/*
+ * This represents details about the target file extent item of a write operation.
+ */
+struct btrfs_file_extent {
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 num_bytes;
+ u64 ram_bytes;
+ u64 offset;
+ u8 compression;
+};
+
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type);
+ const struct btrfs_file_extent *file_extent, unsigned long flags);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
@@ -198,9 +204,9 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
- const u64 range_start, const u64 range_len);
+ const struct btrfs_block_group *bg);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
- const u64 range_start, const u64 range_len);
+ const struct btrfs_block_group *bg);
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
@@ -208,6 +214,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len);
+void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 7a1b021b5669..9f3ad124104f 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -4,15 +4,13 @@
*/
#include "ctree.h"
-#include "disk-io.h"
#include "orphan.h"
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
- int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
@@ -22,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
-
- btrfs_free_path(path);
- return ret;
+ return btrfs_insert_empty_item(trans, root, path, &key, 0);
}
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -45,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
- if (ret) { /* JDM: Really? */
- ret = -ENOENT;
- goto out;
- }
-
- ret = btrfs_del_item(trans, root, path);
+ return ret;
+ if (ret)
+ return -ENOENT;
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h
index 3faab5cbb59a..aa54a88a60de 100644
--- a/fs/btrfs/orphan.h
+++ b/fs/btrfs/orphan.h
@@ -3,6 +3,11 @@
#ifndef BTRFS_ORPHAN_H
#define BTRFS_ORPHAN_H
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_root;
+
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 815a5fc3ff9d..fc821aa446f0 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -9,6 +9,8 @@
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
+#include "volumes.h"
+#include "raid-stripe-tree.h"
struct root_name_map {
u64 id;
@@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = {
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
{ BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+ { BTRFS_RAID_STRIPE_TREE_OBJECTID, "RAID_STRIPE_TREE" },
};
const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
@@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
+static void print_extent_owner_ref(const struct extent_buffer *eb,
+ const struct btrfs_extent_owner_ref *ref)
+{
+ ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA));
+ pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref));
+}
+
static void print_extent_item(const struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
+ struct btrfs_extent_owner_ref *oref;
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
@@ -98,7 +109,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
btrfs_err(eb->fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL);
+ return;
}
ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
@@ -161,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
"\t\t\t(parent %llu not aligned to sectorsize %u)\n",
offset, eb->fs_info->sectorsize);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ print_extent_owner_ref(eb, oref);
+ break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
eb->start, type);
@@ -189,6 +204,17 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
}
}
+static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size,
+ struct btrfs_stripe_extent *stripe)
+{
+ const int num_stripes = btrfs_num_raid_stripes(item_size);
+
+ for (int i = 0; i < num_stripes; i++)
+ pr_info("\t\t\tstride %d devid %llu physical %llu\n",
+ i, btrfs_raid_stride_devid(eb, &stripe->strides[i]),
+ btrfs_raid_stride_physical(eb, &stripe->strides[i]));
+}
+
/*
* Helper to output refs and locking status of extent buffer. Useful to debug
* race condition related problems.
@@ -279,6 +305,9 @@ void btrfs_print_leaf(const struct extent_buffer *l)
case BTRFS_EXTENT_DATA_KEY:
fi = btrfs_item_ptr(l, i,
struct btrfs_file_extent_item);
+ pr_info("\t\tgeneration %llu type %hhu\n",
+ btrfs_file_extent_generation(l, fi),
+ btrfs_file_extent_type(l, fi));
if (btrfs_file_extent_type(l, fi) ==
BTRFS_FILE_EXTENT_INLINE) {
pr_info("\t\tinline extent data size %llu\n",
@@ -349,6 +378,10 @@ void btrfs_print_leaf(const struct extent_buffer *l)
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
btrfs_item_size(l, i));
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ print_raid_stripe_key(l, btrfs_item_size(l, i),
+ btrfs_item_ptr(l, i, struct btrfs_stripe_extent));
+ break;
}
}
}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index c42bc666d5ee..8504bf1702c7 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -9,6 +9,9 @@
/* Buffer size to contain tree name and possibly additional data (offset) */
#define BTRFS_ROOT_NAME_BUF_LEN 48
+struct extent_buffer;
+struct btrfs_key;
+
void btrfs_print_leaf(const struct extent_buffer *l);
void btrfs_print_tree(const struct extent_buffer *c, bool follow);
const char *btrfs_root_name(const struct btrfs_key *key, char *buf);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 0755af0e53e3..b8fa34e16abb 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -4,6 +4,7 @@
*/
#include <linux/hashtable.h>
+#include <linux/xattr.h>
#include "messages.h"
#include "props.h"
#include "btrfs_inode.h"
@@ -15,6 +16,7 @@
#include "fs.h"
#include "accessors.h"
#include "super.h"
+#include "dir-item.h"
#define BTRFS_PROP_HANDLERS_HT_BITS 8
static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
@@ -25,7 +27,7 @@ struct prop_handler {
int (*validate)(const struct btrfs_inode *inode, const char *value,
size_t len);
int (*apply)(struct inode *inode, const char *value, size_t len);
- const char *(*extract)(struct inode *inode);
+ const char *(*extract)(const struct inode *inode);
bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -102,7 +104,7 @@ bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
return handler->ignore(inode);
}
-int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
const char *name, const char *value, size_t value_len,
int flags)
{
@@ -114,29 +116,29 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
return -EINVAL;
if (value_len == 0) {
- ret = btrfs_setxattr(trans, inode, handler->xattr_name,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name,
NULL, 0, flags);
if (ret)
return ret;
- ret = handler->apply(inode, NULL, 0);
+ ret = handler->apply(&inode->vfs_inode, NULL, 0);
ASSERT(ret == 0);
return ret;
}
- ret = btrfs_setxattr(trans, inode, handler->xattr_name, value,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, value,
value_len, flags);
if (ret)
return ret;
- ret = handler->apply(inode, value, value_len);
+ ret = handler->apply(&inode->vfs_inode, value, value_len);
if (ret) {
- btrfs_setxattr(trans, inode, handler->xattr_name, NULL,
+ btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL,
0, flags);
return ret;
}
- set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags);
return 0;
}
@@ -266,7 +268,7 @@ static void inode_prop_iterator(void *ctx,
btrfs_warn(root->fs_info,
"error applying prop %s to ino %llu (root %llu): %d",
handler->xattr_name, btrfs_ino(BTRFS_I(inode)),
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
else
set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
}
@@ -301,7 +303,7 @@ static int prop_compression_validate(const struct btrfs_inode *inode,
static int prop_compression_apply(struct inode *inode, const char *value,
size_t len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
int type;
/* Reset to defaults */
@@ -357,7 +359,7 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode)
return false;
}
-static const char *prop_compression_extract(struct inode *inode)
+static const char *prop_compression_extract(const struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
@@ -383,7 +385,7 @@ static struct prop_handler prop_handlers[] = {
};
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *parent)
+ struct inode *inode, const struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 6e283196e38a..63546d0a9444 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -6,11 +6,16 @@
#ifndef BTRFS_PROPS_H
#define BTRFS_PROPS_H
-#include "ctree.h"
+#include <linux/compiler_types.h>
+
+struct inode;
+struct btrfs_inode;
+struct btrfs_path;
+struct btrfs_trans_handle;
int __init btrfs_props_init(void);
-int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
const char *name, const char *value, size_t value_len,
int flags);
int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
@@ -21,6 +26,6 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
- struct inode *dir);
+ const struct inode *dir);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1b9f4f16d124..6b181bf9f156 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,6 +30,25 @@
#include "root-tree.h"
#include "tree-checker.h"
+enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info)
+{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return BTRFS_QGROUP_MODE_DISABLED;
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
+ return BTRFS_QGROUP_MODE_SIMPLE;
+ return BTRFS_QGROUP_MODE_FULL;
+}
+
+bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info)
+{
+ return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
+}
+
+bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info)
+{
+ return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
+}
+
/*
* Helpers to access qgroup reservation
*
@@ -88,7 +107,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *dest,
- struct btrfs_qgroup *src)
+ const struct btrfs_qgroup *src)
{
int i;
@@ -98,7 +117,7 @@ static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *dest,
- struct btrfs_qgroup *src)
+ const struct btrfs_qgroup *src)
{
int i;
@@ -122,47 +141,27 @@ static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
qg->new_refcnt += mod;
}
-static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
if (qg->old_refcnt < seq)
return 0;
return qg->old_refcnt - seq;
}
-static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+static inline u64 btrfs_qgroup_get_new_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
if (qg->new_refcnt < seq)
return 0;
return qg->new_refcnt - seq;
}
-/*
- * glue structure to represent the relations between qgroups.
- */
-struct btrfs_qgroup_list {
- struct list_head next_group;
- struct list_head next_member;
- struct btrfs_qgroup *group;
- struct btrfs_qgroup *member;
-};
-
-static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
-{
- return (u64)(uintptr_t)qg;
-}
-
-static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
-{
- return (struct btrfs_qgroup *)(uintptr_t)n->aux;
-}
-
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
/* must be called with qgroup_ioctl_lock held */
-static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
struct rb_node *n = fs_info->qgroup_tree.rb_node;
@@ -180,35 +179,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
return NULL;
}
-/* must be called with qgroup_lock held */
+/*
+ * Add qgroup to the filesystem's qgroup tree.
+ *
+ * Must be called with qgroup_lock held and @prealloc preallocated.
+ *
+ * The control on the lifespan of @prealloc would be transferred to this
+ * function, thus caller should no longer touch @prealloc.
+ */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *prealloc,
u64 qgroupid)
{
struct rb_node **p = &fs_info->qgroup_tree.rb_node;
struct rb_node *parent = NULL;
struct btrfs_qgroup *qgroup;
+ /* Caller must have pre-allocated @prealloc. */
+ ASSERT(prealloc);
+
while (*p) {
parent = *p;
qgroup = rb_entry(parent, struct btrfs_qgroup, node);
- if (qgroup->qgroupid < qgroupid)
+ if (qgroup->qgroupid < qgroupid) {
p = &(*p)->rb_left;
- else if (qgroup->qgroupid > qgroupid)
+ } else if (qgroup->qgroupid > qgroupid) {
p = &(*p)->rb_right;
- else
+ } else {
+ kfree(prealloc);
return qgroup;
+ }
}
- qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
- if (!qgroup)
- return ERR_PTR(-ENOMEM);
-
+ qgroup = prealloc;
qgroup->qgroupid = qgroupid;
INIT_LIST_HEAD(&qgroup->groups);
INIT_LIST_HEAD(&qgroup->members);
INIT_LIST_HEAD(&qgroup->dirty);
INIT_LIST_HEAD(&qgroup->iterator);
+ INIT_LIST_HEAD(&qgroup->nested_iterator);
rb_link_node(&qgroup->node, parent, p);
rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
@@ -255,27 +265,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
/*
* Add relation specified by two qgroups.
*
- * Must be called with qgroup_lock held.
+ * Must be called with qgroup_lock held, the ownership of @prealloc is
+ * transferred to this function and caller should not touch it anymore.
*
* Return: 0 on success
* -ENOENT if one of the qgroups is NULL
* <0 other errors
*/
-static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
+static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
+ struct btrfs_qgroup *member,
+ struct btrfs_qgroup *parent)
{
- struct btrfs_qgroup_list *list;
-
- if (!member || !parent)
+ if (!member || !parent) {
+ kfree(prealloc);
return -ENOENT;
+ }
- list = kzalloc(sizeof(*list), GFP_ATOMIC);
- if (!list)
- return -ENOMEM;
-
- list->group = parent;
- list->member = member;
- list_add_tail(&list->next_group, &member->groups);
- list_add_tail(&list->next_member, &parent->members);
+ prealloc->group = parent;
+ prealloc->member = member;
+ list_add_tail(&prealloc->next_group, &member->groups);
+ list_add_tail(&prealloc->next_member, &parent->members);
return 0;
}
@@ -289,7 +298,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
* -ENOENT if one of the ids does not exist
* <0 other errors
*/
-static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_list *prealloc,
+ u64 memberid, u64 parentid)
{
struct btrfs_qgroup *member;
struct btrfs_qgroup *parent;
@@ -297,7 +308,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare
member = find_qgroup_rb(fs_info, memberid);
parent = find_qgroup_rb(fs_info, parentid);
- return __add_relation_rb(member, parent);
+ return __add_relation_rb(prealloc, member, parent);
}
/* Must be called with qgroup_lock held */
@@ -325,7 +336,7 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl)
{
struct btrfs_qgroup *qgroup;
@@ -341,11 +352,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
{
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return;
fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
}
+static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot,
+ struct btrfs_qgroup_status_item *ptr)
+{
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
+ fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr);
+}
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -362,7 +384,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
u64 flags = 0;
u64 rescan_progress = 0;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!fs_info->quota_root)
return 0;
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
@@ -412,14 +434,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
"old qgroup version, quota disabled");
goto out;
}
- if (btrfs_qgroup_status_generation(l, ptr) !=
- fs_info->generation) {
+ fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+ qgroup_read_enable_gen(fs_info, l, slot, ptr);
+ } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
qgroup_mark_inconsistent(fs_info);
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
- fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
- ptr);
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -435,11 +457,34 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup_mark_inconsistent(fs_info);
}
if (!qgroup) {
- qgroup = add_qgroup_rb(fs_info, found_key.offset);
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
+ struct btrfs_qgroup *prealloc;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ if (!prealloc) {
+ ret = -ENOMEM;
goto out;
}
+ qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
+ /*
+ * If a qgroup exists for a subvolume ID, it is possible
+ * that subvolume has been deleted, in which case
+ * re-using that ID would lead to incorrect accounting.
+ *
+ * Ensure that we skip any such subvol ids.
+ *
+ * We don't need to lock because this is only called
+ * during mount before we start doing things like creating
+ * subvolumes.
+ */
+ if (is_fstree(qgroup->qgroupid) &&
+ qgroup->qgroupid > tree_root->free_objectid)
+ /*
+ * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
+ * as it will get checked on the next call to
+ * btrfs_get_free_objectid.
+ */
+ tree_root->free_objectid = qgroup->qgroupid + 1;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0)
@@ -490,6 +535,8 @@ next1:
if (ret)
goto out;
while (1) {
+ struct btrfs_qgroup_list *list = NULL;
+
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
@@ -503,8 +550,14 @@ next1:
goto next2;
}
- ret = add_relation_rb(fs_info, found_key.objectid,
+ list = kzalloc(sizeof(*list), GFP_KERNEL);
+ if (!list) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_relation_rb(fs_info, list, found_key.objectid,
found_key.offset);
+ list = NULL;
if (ret == -ENOENT) {
btrfs_warn(fs_info,
"orphan qgroup relation 0x%llx->0x%llx",
@@ -523,13 +576,12 @@ next2:
out:
btrfs_free_path(path);
fs_info->qgroup_flags |= flags;
- if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
- ret >= 0)
- ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-
- if (ret < 0) {
+ if (ret >= 0) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+ } else {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -546,12 +598,12 @@ out:
* Return false if no reserved space is left.
* Return true if some reserved space is leaked.
*/
-bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
bool ret = false;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
return ret;
/*
* Since we're unmounting, there is no race and no need to grab qgroup
@@ -950,7 +1002,8 @@ out:
return ret;
}
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
{
struct btrfs_root *quota_root;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -960,8 +1013,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
+ struct btrfs_qgroup *prealloc = NULL;
struct btrfs_trans_handle *trans = NULL;
struct ulist *ulist = NULL;
+ const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
int ret = 0;
int slot;
@@ -1064,8 +1119,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup_status_item);
btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
- fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
+ if (simple) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
+ btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
+ btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
+ } else {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
@@ -1095,6 +1156,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
/* Release locks on tree_root before we access quota_root */
btrfs_release_path(path);
+ /* We should not have a stray @prealloc pointer. */
+ ASSERT(prealloc == NULL);
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
@@ -1102,7 +1172,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out_free_path;
}
- qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
+ prealloc = NULL;
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
btrfs_abort_transaction(trans, ret);
@@ -1145,18 +1216,22 @@ out_add_root:
goto out_free_path;
}
- qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- btrfs_abort_transaction(trans, ret);
+ ASSERT(prealloc == NULL);
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
goto out_free_path;
}
+ qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
+ prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ fs_info->qgroup_enable_gen = trans->transid;
+
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* Commit the transaction while not holding qgroup_ioctl_lock, to avoid
@@ -1183,6 +1258,10 @@ out_add_root:
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
spin_unlock(&fs_info->qgroup_lock);
+ /* Skip rescan for simple qgroups. */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ goto out_free_path;
+
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
@@ -1223,9 +1302,37 @@ out:
else if (trans)
ret = btrfs_end_transaction(trans);
ulist_free(ulist);
+ kfree(prealloc);
return ret;
}
+/*
+ * It is possible to have outstanding ordered extents which reserved bytes
+ * before we disabled. We need to fully flush delalloc, ordered extents, and a
+ * commit to ensure that we don't leak such reservations, only to have them
+ * come back if we re-enable.
+ *
+ * - enable simple quotas
+ * - reserve space
+ * - release it, store rsv_bytes in OE
+ * - disable quotas
+ * - enable simple quotas (qgroup rsv are all 0)
+ * - OE finishes
+ * - run delayed refs
+ * - free rsv_bytes, resulting in miscounting or even underflow
+ */
+static int flush_reservations(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
+
+ return btrfs_commit_current_transaction(fs_info->tree_root);
+}
+
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *quota_root = NULL;
@@ -1239,16 +1346,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
lockdep_assert_held_write(&fs_info->subvol_sem);
/*
- * Lock the cleaner mutex to prevent races with concurrent relocation,
- * because relocation may be building backrefs for blocks of the quota
- * root while we are deleting the root. This is like dropping fs roots
- * of deleted snapshots/subvolumes, we need the same protection.
- *
- * This also prevents races between concurrent tasks trying to disable
- * quotas, because we will unlock and relock qgroup_ioctl_lock across
- * BTRFS_FS_QUOTA_ENABLED changes.
+ * Relocation will mess with backrefs, so make sure we have the
+ * cleaner_mutex held to protect us from relocate.
*/
- mutex_lock(&fs_info->cleaner_mutex);
+ lockdep_assert_held(&fs_info->cleaner_mutex);
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
@@ -1271,6 +1372,17 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_qgroup_wait_for_completion(fs_info, false);
/*
+ * We have nothing held here and no trans handle, just return the error
+ * if there is one and set back the quota enabled bit since we didn't
+ * actually disable quotas.
+ */
+ ret = flush_reservations(fs_info);
+ if (ret) {
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ return ret;
+ }
+
+ /*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
@@ -1296,7 +1408,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
@@ -1332,9 +1445,7 @@ out:
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
- ret = btrfs_end_transaction(trans);
- mutex_unlock(&fs_info->cleaner_mutex);
-
+ ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -1377,14 +1488,11 @@ static void qgroup_iterator_clean(struct list_head *head)
*
* Caller should hold fs_info->qgroup_lock.
*/
-static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
- struct ulist *tmp, u64 ref_root,
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup_list *glist;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
int ret = 0;
@@ -1392,53 +1500,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
if (!qgroup)
goto out;
- qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
-
- WARN_ON(sign < 0 && qgroup->excl < num_bytes);
- qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
-
- if (sign > 0)
- qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
- else
- qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
-
- qgroup_dirty(fs_info, qgroup);
-
- /* Get all of the parent groups that contain this qgroup */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
- /* Iterate all of the parents and adjust their reference counts */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- qgroup = unode_aux_to_qgroup(unode);
qgroup->rfer += sign * num_bytes;
qgroup->rfer_cmpr += sign * num_bytes;
+
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
else
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
- qgroup->excl_cmpr += sign * num_bytes;
qgroup_dirty(fs_info, qgroup);
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ /* Append parent qgroups to @qgroup_list. */
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
ret = 0;
out:
+ qgroup_iterator_clean(&qgroup_list);
return ret;
}
@@ -1455,24 +1540,19 @@ out:
* Return < 0 for other error.
*/
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
- struct ulist *tmp, u64 src, u64 dst,
- int sign)
+ u64 src, u64 dst, int sign)
{
struct btrfs_qgroup *qgroup;
int ret = 1;
- int err = 0;
qgroup = find_qgroup_rb(fs_info, src);
if (!qgroup)
goto out;
if (qgroup->excl == qgroup->rfer) {
- ret = 0;
- err = __qgroup_excl_accounting(fs_info, tmp, dst,
- qgroup, sign);
- if (err < 0) {
- ret = err;
+ ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
+ if (ret < 0)
goto out;
- }
+ ret = 0;
}
out:
if (ret)
@@ -1480,28 +1560,25 @@ out:
return ret;
}
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
- u64 dst)
+/*
+ * Add relation between @src and @dst qgroup. The @prealloc is allocated by the
+ * callers and transferred here (either used or freed on error).
+ */
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
+ struct btrfs_qgroup_list *prealloc)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct ulist *tmp;
- unsigned int nofs_flag;
int ret = 0;
+ ASSERT(prealloc);
+
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
- /* We hold a transaction handle open, must do a NOFS allocation. */
- nofs_flag = memalloc_nofs_save();
- tmp = ulist_alloc(GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!tmp)
- return -ENOMEM;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1533,16 +1610,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
spin_lock(&fs_info->qgroup_lock);
- ret = __add_relation_rb(member, parent);
+ ret = __add_relation_rb(prealloc, member, parent);
+ prealloc = NULL;
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
}
- ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+ ret = quick_update_accounting(fs_info, src, dst, 1);
spin_unlock(&fs_info->qgroup_lock);
out:
+ kfree(prealloc);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
- ulist_free(tmp);
return ret;
}
@@ -1553,19 +1631,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct ulist *tmp;
bool found = false;
- unsigned int nofs_flag;
int ret = 0;
int ret2;
- /* We hold a transaction handle open, must do a NOFS allocation. */
- nofs_flag = memalloc_nofs_save();
- tmp = ulist_alloc(GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!tmp)
- return -ENOMEM;
-
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
@@ -1603,11 +1672,10 @@ delete_item:
if (found) {
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
- ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+ ret = quick_update_accounting(fs_info, src, dst, -1);
spin_unlock(&fs_info->qgroup_lock);
}
out:
- ulist_free(tmp);
return ret;
}
@@ -1629,6 +1697,7 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
@@ -1643,31 +1712,77 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
ret = add_qgroup_item(trans, quota_root, qgroupid);
if (ret)
goto out;
spin_lock(&fs_info->qgroup_lock);
- qgroup = add_qgroup_rb(fs_info, qgroupid);
+ qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
+ prealloc = NULL;
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- goto out;
- }
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ kfree(prealloc);
return ret;
}
-static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+/*
+ * Return 0 if we can not delete the qgroup (not empty or has children etc).
+ * Return >0 if we can delete the qgroup.
+ * Return <0 for other errors during tree search.
+ */
+static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
{
- return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
- qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ /*
+ * Squota would never be inconsistent, but there can still be case
+ * where a dropped subvolume still has qgroup numbers, and squota
+ * relies on such qgroup for future accounting.
+ *
+ * So for squota, do not allow dropping any non-zero qgroup.
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
+ (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr))
+ return 0;
+
+ /* For higher level qgroup, we can only delete it if it has no child. */
+ if (btrfs_qgroup_level(qgroup->qgroupid)) {
+ if (!list_empty(&qgroup->members))
+ return 0;
+ return 1;
+ }
+
+ /*
+ * For level-0 qgroups, we can only delete it if it has no subvolume
+ * for it.
+ * This means even a subvolume is unlinked but not yet fully dropped,
+ * we can not delete the qgroup.
+ */
+ key.objectid = qgroup->qgroupid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = -1ULL;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
+ btrfs_free_path(path);
+ /*
+ * The @ret from btrfs_find_root() exactly matches our definition for
+ * the return value, thus can be returned directly.
+ */
+ return ret;
}
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
@@ -1689,7 +1804,10 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
- if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = can_delete_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+ if (ret == 0) {
ret = -EBUSY;
goto out;
}
@@ -1714,6 +1832,45 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
}
spin_lock(&fs_info->qgroup_lock);
+ /*
+ * Warn on reserved space. The subvolume should has no child nor
+ * corresponding subvolume.
+ * Thus its reserved space should all be zero, no matter if qgroup
+ * is consistent or the mode.
+ */
+ if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn_rl(fs_info,
+"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
+
+ }
+ /*
+ * The same for rfer/excl numbers, but that's only if our qgroup is
+ * consistent and if it's in regular qgroup mode.
+ * For simple mode it's not as accurate thus we can hit non-zero values
+ * very frequently.
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
+ !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
+ if (qgroup->rfer || qgroup->excl ||
+ qgroup->rfer_cmpr || qgroup->excl_cmpr) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn_rl(fs_info,
+"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rfer, qgroup->rfer_cmpr,
+ qgroup->excl, qgroup->excl_cmpr);
+ qgroup_mark_inconsistent(fs_info);
+ }
+ }
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
@@ -1729,6 +1886,40 @@ out:
return ret;
}
+int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root)
+ return 0;
+
+ /*
+ * Commit current transaction to make sure all the rfer/excl numbers
+ * get updated.
+ */
+ ret = btrfs_commit_current_transaction(fs_info->quota_root);
+ if (ret < 0)
+ return ret;
+
+ /* Start new trans to delete the qgroup info and limit items. */
+ trans = btrfs_start_transaction(fs_info->quota_root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_remove_qgroup(trans, subvolid);
+ btrfs_end_transaction(trans);
+ /*
+ * It's squota and the subvolume still has numbers needed for future
+ * accounting, in this case we can not delete it. Just skip it.
+ *
+ * Or the qgroup is already removed by a qgroup rescan. For both cases we're
+ * safe to ignore them.
+ */
+ if (ret == -EBUSY || ret == -ENOENT)
+ ret = 0;
+ return ret;
+}
+
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
@@ -1806,47 +1997,90 @@ out:
return ret;
}
+/*
+ * Inform qgroup to trace one dirty extent, its info is recorded in @record.
+ * So qgroup can account it at transaction committing time.
+ *
+ * No lock version, caller must acquire delayed ref lock and allocated memory,
+ * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
+ *
+ * Return 0 for success insert
+ * Return >0 for existing record, caller can free @record safely.
+ * Return <0 for insertion failure, caller can free @record safely.
+ */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
{
- struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_qgroup_extent_record *entry;
- u64 bytenr = record->bytenr;
+ struct btrfs_qgroup_extent_record *existing, *ret;
+ const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits);
+
+ if (!btrfs_qgroup_full_accounting(fs_info))
+ return 1;
+
+#if BITS_PER_LONG == 32
+ if (record->bytenr >= MAX_LFS_FILESIZE) {
+ btrfs_err_rl(fs_info,
+"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
+ record->bytenr);
+ btrfs_err_32bit_limit(fs_info);
+ return -EOVERFLOW;
+ }
+#endif
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
- node);
- if (bytenr < entry->bytenr) {
- p = &(*p)->rb_left;
- } else if (bytenr > entry->bytenr) {
- p = &(*p)->rb_right;
- } else {
- if (record->data_rsv && !entry->data_rsv) {
- entry->data_rsv = record->data_rsv;
- entry->data_rsv_refroot =
- record->data_rsv_refroot;
- }
- return 1;
+ xa_lock(&delayed_refs->dirty_extents);
+ existing = xa_load(&delayed_refs->dirty_extents, index);
+ if (existing) {
+ if (record->data_rsv && !existing->data_rsv) {
+ existing->data_rsv = record->data_rsv;
+ existing->data_rsv_refroot = record->data_rsv_refroot;
}
+ xa_unlock(&delayed_refs->dirty_extents);
+ return 1;
+ }
+
+ ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
+ xa_unlock(&delayed_refs->dirty_extents);
+ if (xa_is_err(ret)) {
+ qgroup_mark_inconsistent(fs_info);
+ return xa_err(ret);
}
- rb_link_node(&record->node, parent_node, p);
- rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
return 0;
}
+/*
+ * Post handler after qgroup_trace_extent_nolock().
+ *
+ * NOTE: Current qgroup does the expensive backref walk at transaction
+ * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
+ * new transaction.
+ * This is designed to allow btrfs_find_all_roots() to get correct new_roots
+ * result.
+ *
+ * However for old_roots there is no need to do backref walk at that time,
+ * since we search commit roots to walk backref and result will always be
+ * correct.
+ *
+ * Due to the nature of no lock version, we can't do backref there.
+ * So we must call btrfs_qgroup_trace_extent_post() after exiting
+ * spinlock context.
+ *
+ * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
+ * using current root, then we can move all expensive backref walk out of
+ * transaction committing, but not now as qgroup accounting will be wrong again.
+ */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
int ret;
+ if (!btrfs_qgroup_full_accounting(trans->fs_info))
+ return 0;
/*
* We are always called in a context where we are already holding a
* transaction handle. Often we are called when adding a data delayed
@@ -1894,21 +2128,39 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes.
+ * So qgroup can account it at commit trans time.
+ *
+ * Better encapsulated version, with memory allocation and backref walk for
+ * commit roots.
+ * So this can sleep.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans)
+ */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs;
+ const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
- || bytenr == 0 || num_bytes == 0)
+ if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
return 0;
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record)
return -ENOMEM;
+ if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) {
+ kfree(record);
+ return -ENOMEM;
+ }
+
delayed_refs = &trans->transaction->delayed_refs;
record->bytenr = bytenr;
record->num_bytes = num_bytes;
@@ -1917,13 +2169,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
spin_lock(&delayed_refs->lock);
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
spin_unlock(&delayed_refs->lock);
- if (ret > 0) {
+ if (ret) {
+ /* Clean up if insertion fails or item exists. */
+ xa_release(&delayed_refs->dirty_extents, index);
kfree(record);
return 0;
}
return btrfs_qgroup_trace_extent_post(trans, record);
}
+/*
+ * Inform qgroup to trace all leaf items of data
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM)
+ */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
@@ -1935,7 +2195,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
u64 bytenr, num_bytes;
/* We can be called directly from walk_up_proc() */
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
for (i = 0; i < nr; i++) {
@@ -2311,7 +2571,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
int level;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/* Wrong parameter order */
@@ -2354,6 +2614,16 @@ out:
return ret;
}
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation(tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM or tree search error)
+ */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
@@ -2365,10 +2635,10 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
- BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
- BUG_ON(root_eb == NULL);
+ ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL);
+ ASSERT(root_eb != NULL);
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
spin_lock(&fs_info->qgroup_lock);
@@ -2480,62 +2750,64 @@ out:
return ret;
}
+static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+ if (!list_empty(&qgroup->nested_iterator))
+ return;
+
+ list_add_tail(&qgroup->nested_iterator, head);
+}
+
+static void qgroup_iterator_nested_clean(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
+ list_del_init(&qgroup->nested_iterator);
+ }
+}
+
#define UPDATE_NEW 0
#define UPDATE_OLD 1
/*
* Walk all of the roots that points to the bytenr and adjust their refcnts.
*/
-static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- struct ulist *qgroups, u64 seq, int update_old)
+static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct list_head *qgroups,
+ u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
- int ret = 0;
if (!roots)
- return 0;
+ return;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
+ LIST_HEAD(tmp);
+
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
- ulist_reinit(tmp);
- ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ qgroup_iterator_nested_add(qgroups, qg);
+ qgroup_iterator_add(&tmp, qg);
+ list_for_each_entry(qg, &tmp, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(tmp_unode);
if (update_old)
btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
btrfs_qgroup_update_new_refcnt(qg, seq, 1);
+
list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(qgroups, glist->group->qgroupid,
- qgroup_to_aux(glist->group),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
+ qgroup_iterator_nested_add(qgroups, glist->group);
+ qgroup_iterator_add(&tmp, glist->group);
}
}
+ qgroup_iterator_clean(&tmp);
}
- return 0;
}
/*
@@ -2574,22 +2846,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* But this time we don't need to consider other things, the codes and logic
* is easy to understand now.
*/
-static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
- struct ulist *qgroups,
- u64 nr_old_roots,
- u64 nr_new_roots,
- u64 num_bytes, u64 seq)
+static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct list_head *qgroups, u64 nr_old_roots,
+ u64 nr_new_roots, u64 num_bytes, u64 seq)
{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- u64 cur_new_count, cur_old_count;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(qgroups, &uiter))) {
+ list_for_each_entry(qg, qgroups, nested_iterator) {
+ u64 cur_new_count, cur_old_count;
bool dirty = false;
- qg = unode_aux_to_qgroup(unode);
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
@@ -2660,7 +2926,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
if (dirty)
qgroup_dirty(fs_info, qg);
}
- return 0;
}
/*
@@ -2697,8 +2962,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct ulist *new_roots)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct ulist *qgroups = NULL;
- struct ulist *tmp = NULL;
+ LIST_HEAD(qgroups);
u64 seq;
u64 nr_new_roots = 0;
u64 nr_old_roots = 0;
@@ -2708,7 +2972,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
* If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (!btrfs_qgroup_full_accounting(fs_info) ||
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
goto out_free;
@@ -2730,17 +2994,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
num_bytes, nr_old_roots, nr_new_roots);
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups) {
- ret = -ENOMEM;
- goto out_free;
- }
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp) {
- ret = -ENOMEM;
- goto out_free;
- }
-
mutex_lock(&fs_info->qgroup_rescan_lock);
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
@@ -2755,29 +3008,27 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
seq = fs_info->qgroup_seq;
/* Update old refcnts using old_roots */
- ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
- UPDATE_OLD);
- if (ret < 0)
- goto out;
+ qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD);
/* Update new refcnts using new_roots */
- ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
- UPDATE_NEW);
- if (ret < 0)
- goto out;
+ qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW);
- qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
num_bytes, seq);
/*
+ * We're done using the iterator, release all its qgroups while holding
+ * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
+ * and trigger use-after-free accesses to qgroups.
+ */
+ qgroup_iterator_nested_clean(&qgroups);
+
+ /*
* Bump qgroup_seq to avoid seq overlap
*/
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
-out:
spin_unlock(&fs_info->qgroup_lock);
out_free:
- ulist_free(tmp);
- ulist_free(qgroups);
ulist_free(old_roots);
ulist_free(new_roots);
return ret;
@@ -2789,17 +3040,17 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
- struct rb_node *node;
+ unsigned long index;
u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return 0;
+
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
- while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
- record = rb_entry(node, struct btrfs_qgroup_extent_record,
- node);
-
+ xa_for_each(&delayed_refs->dirty_extents, index, record) {
num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
@@ -2865,7 +3116,7 @@ cleanup:
ulist_free(record->old_roots);
ulist_free(new_roots);
new_roots = NULL;
- rb_erase(node, &delayed_refs->dirty_extent_root);
+ xa_erase(&delayed_refs->dirty_extents, index);
kfree(record);
}
@@ -2909,7 +3160,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
qgroup_mark_inconsistent(fs_info);
spin_lock(&fs_info->qgroup_lock);
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (btrfs_qgroup_enabled(fs_info))
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
else
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -2922,6 +3173,159 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
return ret;
}
+int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_inherit *inherit,
+ size_t size)
+{
+ if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
+ return -EOPNOTSUPP;
+ if (size < sizeof(*inherit) || size > PAGE_SIZE)
+ return -EINVAL;
+
+ /*
+ * In the past we allowed btrfs_qgroup_inherit to specify to copy
+ * rfer/excl numbers directly from other qgroups. This behavior has
+ * been disabled in userspace for a very long time, but here we should
+ * also disable it in kernel, as this behavior is known to mark qgroup
+ * inconsistent, and a rescan would wipe out the changes anyway.
+ *
+ * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies.
+ */
+ if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0)
+ return -EINVAL;
+
+ if (size != struct_size(inherit, qgroups, inherit->num_qgroups))
+ return -EINVAL;
+
+ /*
+ * Skip the inherit source qgroups check if qgroup is not enabled.
+ * Qgroup can still be later enabled causing problems, but in that case
+ * btrfs_qgroup_inherit() would just ignore those invalid ones.
+ */
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
+ /*
+ * Now check all the remaining qgroups, they should all:
+ *
+ * - Exist
+ * - Be higher level qgroups.
+ */
+ for (int i = 0; i < inherit->num_qgroups; i++) {
+ struct btrfs_qgroup *qgroup;
+ u64 qgroupid = inherit->qgroups[i];
+
+ if (btrfs_qgroup_level(qgroupid) == 0)
+ return -EINVAL;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ spin_unlock(&fs_info->qgroup_lock);
+ return -ENOENT;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+ }
+ return 0;
+}
+
+static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
+ u64 inode_rootid,
+ struct btrfs_qgroup_inherit **inherit)
+{
+ int i = 0;
+ u64 num_qgroups = 0;
+ struct btrfs_qgroup *inode_qg;
+ struct btrfs_qgroup_list *qg_list;
+ struct btrfs_qgroup_inherit *res;
+ size_t struct_sz;
+ u64 *qgids;
+
+ if (*inherit)
+ return -EEXIST;
+
+ inode_qg = find_qgroup_rb(fs_info, inode_rootid);
+ if (!inode_qg)
+ return -ENOENT;
+
+ num_qgroups = list_count_nodes(&inode_qg->groups);
+
+ if (!num_qgroups)
+ return 0;
+
+ struct_sz = struct_size(res, qgroups, num_qgroups);
+ if (struct_sz == SIZE_MAX)
+ return -ERANGE;
+
+ res = kzalloc(struct_sz, GFP_NOFS);
+ if (!res)
+ return -ENOMEM;
+ res->num_qgroups = num_qgroups;
+ qgids = res->qgroups;
+
+ list_for_each_entry(qg_list, &inode_qg->groups, next_group)
+ qgids[i++] = qg_list->group->qgroupid;
+
+ *inherit = res;
+ return 0;
+}
+
+/*
+ * Check if we can skip rescan when inheriting qgroups. If @src has a single
+ * @parent, and that @parent is owning all its bytes exclusively, we can skip
+ * the full rescan, by just adding nodesize to the @parent's excl/rfer.
+ *
+ * Return <0 for fatal errors (like srcid/parentid has no qgroup).
+ * Return 0 if a quick inherit is done.
+ * Return >0 if a quick inherit is not possible, and a full rescan is needed.
+ */
+static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
+ u64 srcid, u64 parentid)
+{
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+ int nr_parents = 0;
+
+ src = find_qgroup_rb(fs_info, srcid);
+ if (!src)
+ return -ENOENT;
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!parent)
+ return -ENOENT;
+
+ /*
+ * Source has no parent qgroup, but our new qgroup would have one.
+ * Qgroup numbers would become inconsistent.
+ */
+ if (list_empty(&src->groups))
+ return 1;
+
+ list_for_each_entry(list, &src->groups, next_group) {
+ /* The parent is not the same, quick update is not possible. */
+ if (list->group->qgroupid != parentid)
+ return 1;
+ nr_parents++;
+ /*
+ * More than one parent qgroup, we can't be sure about accounting
+ * consistency.
+ */
+ if (nr_parents > 1)
+ return 1;
+ }
+
+ /*
+ * The parent is not exclusively owning all its bytes. We're not sure
+ * if the source has any bytes not fully owned by the parent.
+ */
+ if (parent->excl != parent->rfer)
+ return 1;
+
+ parent->excl += fs_info->nodesize;
+ parent->rfer += fs_info->nodesize;
+ return 0;
+}
+
/*
* Copy the accounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
@@ -2929,20 +3333,27 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
* when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
- u64 objectid, struct btrfs_qgroup_inherit *inherit)
+ u64 objectid, u64 inode_rootid,
+ struct btrfs_qgroup_inherit *inherit)
{
int ret = 0;
- int i;
u64 *i_qgroups;
bool committing = false;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
+ struct btrfs_qgroup *prealloc;
+ struct btrfs_qgroup_list **qlist_prealloc = NULL;
+ bool free_inherit = false;
bool need_rescan = false;
u32 level_size = 0;
u64 nums;
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc)
+ return -ENOMEM;
+
/*
* There are only two callers of this function.
*
@@ -2962,7 +3373,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_enabled(fs_info))
goto out;
quota_root = fs_info->quota_root;
@@ -2971,11 +3382,18 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto out;
}
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
+ ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit);
+ if (ret)
+ goto out;
+ free_inherit = true;
+ }
+
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2 * inherit->num_excl_copies;
- for (i = 0; i < nums; ++i) {
+ for (int i = 0; i < nums; i++) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
/*
@@ -3002,7 +3420,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
*/
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
+ for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) {
if (*i_qgroups == 0)
continue;
ret = add_qgroup_relation_item(trans, objectid,
@@ -3015,16 +3433,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto out;
}
ret = 0;
- }
+ qlist_prealloc = kcalloc(inherit->num_qgroups,
+ sizeof(struct btrfs_qgroup_list *),
+ GFP_NOFS);
+ if (!qlist_prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (int i = 0; i < inherit->num_qgroups; i++) {
+ qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
+ GFP_NOFS);
+ if (!qlist_prealloc[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
spin_lock(&fs_info->qgroup_lock);
- dstgroup = add_qgroup_rb(fs_info, objectid);
- if (IS_ERR(dstgroup)) {
- ret = PTR_ERR(dstgroup);
- goto unlock;
- }
+ dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
+ prealloc = NULL;
if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
dstgroup->lim_flags = inherit->lim.flags;
@@ -3036,7 +3466,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
qgroup_dirty(fs_info, dstgroup);
}
- if (srcid) {
+ if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
goto unlock;
@@ -3063,29 +3493,40 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
qgroup_dirty(fs_info, dstgroup);
qgroup_dirty(fs_info, srcgroup);
+
+ /*
+ * If the source qgroup has parent but the new one doesn't,
+ * we need a full rescan.
+ */
+ if (!inherit && !list_empty(&srcgroup->groups))
+ need_rescan = true;
}
if (!inherit)
goto unlock;
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i) {
+ for (int i = 0; i < inherit->num_qgroups; i++) {
if (*i_qgroups) {
- ret = add_relation_rb(fs_info, objectid, *i_qgroups);
+ ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
+ *i_qgroups);
+ qlist_prealloc[i] = NULL;
if (ret)
goto unlock;
}
+ if (srcid) {
+ /* Check if we can do a quick inherit. */
+ ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups);
+ if (ret < 0)
+ goto unlock;
+ if (ret > 0)
+ need_rescan = true;
+ ret = 0;
+ }
++i_qgroups;
-
- /*
- * If we're doing a snapshot, and adding the snapshot to a new
- * qgroup, the numbers are guaranteed to be incorrect.
- */
- if (srcid)
- need_rescan = true;
}
- for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
+ for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
@@ -3106,7 +3547,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
/* Manually tweaking numbers certainly needs a rescan */
need_rescan = true;
}
- for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
+ for (int i = 0; i < inherit->num_excl_copies; i++, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
@@ -3135,6 +3576,14 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
qgroup_mark_inconsistent(fs_info);
+ if (qlist_prealloc) {
+ for (int i = 0; i < inherit->num_qgroups; i++)
+ kfree(qlist_prealloc[i]);
+ kfree(qlist_prealloc);
+ }
+ if (free_inherit)
+ kfree(inherit);
+ kfree(prealloc);
return ret;
}
@@ -3156,7 +3605,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
{
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 ref_root = root->root_key.objectid;
+ u64 ref_root = btrfs_root_id(root);
int ret = 0;
LIST_HEAD(qgroup_list);
@@ -3218,9 +3667,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_qgroup *qgroup;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
+ LIST_HEAD(qgroup_list);
if (!is_fstree(ref_root))
return;
@@ -3248,30 +3695,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
*/
num_bytes = qgroup->rsv.values[type];
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
-
- qgroup_rsv_release(fs_info, qg, num_bytes, type);
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
+ qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
}
-
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3306,6 +3740,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
int slot;
int ret;
+ if (!btrfs_qgroup_full_accounting(fs_info))
+ return 1;
+
mutex_lock(&fs_info->qgroup_rescan_lock);
extent_root = btrfs_extent_root(fs_info,
fs_info->qgroup_rescan_progress.objectid);
@@ -3386,10 +3823,15 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
- return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
- !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
+ if (btrfs_fs_closing(fs_info))
+ return true;
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ return true;
+ if (!btrfs_qgroup_enabled(fs_info))
+ return true;
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
+ return true;
+ return false;
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3398,14 +3840,18 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- int err = -ENOMEM;
int ret = 0;
bool stopped = false;
bool did_leaf_rescans = false;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return;
+
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ ret = -ENOMEM;
goto out;
+ }
/*
* Rescan should only search for commit root, and any later difference
* should be recorded by qgroup
@@ -3413,18 +3859,17 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path->search_commit_root = 1;
path->skip_locking = 1;
- err = 0;
- while (!err && !(stopped = rescan_should_stop(fs_info))) {
+ while (!ret && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
break;
}
- err = qgroup_rescan_leaf(trans, path);
+ ret = qgroup_rescan_leaf(trans, path);
did_leaf_rescans = true;
- if (err > 0)
+ if (ret > 0)
btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
@@ -3434,10 +3879,10 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (err > 0 &&
+ if (ret > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0 || stopped) {
+ } else if (ret < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3452,11 +3897,11 @@ out:
if (did_leaf_rescans) {
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d",
- err);
+ ret);
}
} else {
trans = NULL;
@@ -3467,11 +3912,11 @@ out:
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
- ret = update_qgroup_status_item(trans);
- if (ret < 0) {
- err = ret;
- btrfs_err(fs_info, "fail to update qgroup status: %d",
- err);
+ int ret2 = update_qgroup_status_item(trans);
+
+ if (ret2 < 0) {
+ ret = ret2;
+ btrfs_err(fs_info, "fail to update qgroup status: %d", ret);
}
}
fs_info->qgroup_rescan_running = false;
@@ -3488,11 +3933,11 @@ out:
btrfs_info(fs_info, "qgroup scan paused");
} else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
btrfs_info(fs_info, "qgroup scan cancelled");
- } else if (err >= 0) {
+ } else if (ret >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
- err > 0 ? " (inconsistency flag cleared)" : "");
+ ret > 0 ? " (inconsistency flag cleared)" : "");
} else {
- btrfs_err(fs_info, "qgroup scan failed with %d", err);
+ btrfs_err(fs_info, "qgroup scan failed with %d", ret);
}
}
@@ -3506,18 +3951,23 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
{
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+ btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
+ return -EINVAL;
+ }
+
if (!init_flags) {
/* we're resuming qgroup rescan at mount time */
if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup rescan is not queued");
ret = -EINVAL;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
- ret = -EINVAL;
+ ret = -ENOTCONN;
}
if (ret)
@@ -3528,15 +3978,13 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
if (init_flags) {
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- btrfs_warn(fs_info,
- "qgroup rescan is already in progress");
ret = -EINPROGRESS;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
- ret = -EINVAL;
- } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ ret = -ENOTCONN;
+ } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
/* Quota disable is in progress */
ret = -EBUSY;
}
@@ -3557,7 +4005,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
mutex_unlock(&fs_info->qgroup_rescan_lock);
btrfs_init_work(&fs_info->qgroup_rescan_work,
- btrfs_qgroup_rescan_worker, NULL, NULL);
+ btrfs_qgroup_rescan_worker, NULL);
return 0;
}
@@ -3584,7 +4032,6 @@ int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
int ret = 0;
- struct btrfs_trans_handle *trans;
ret = qgroup_rescan_init(fs_info, 0, 1);
if (ret)
@@ -3601,16 +4048,10 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
* going to clear all tracking information for a clean start.
*/
- trans = btrfs_attach_transaction_barrier(fs_info->fs_root);
- if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) {
+ ret = btrfs_commit_current_transaction(fs_info->fs_root);
+ if (ret) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- return PTR_ERR(trans);
- } else if (trans != ERR_PTR(-ENOENT)) {
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- return ret;
- }
+ return ret;
}
qgroup_rescan_zero_tracking(fs_info);
@@ -3746,7 +4187,6 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
*/
static int try_flush_qgroup(struct btrfs_root *root)
{
- struct btrfs_trans_handle *trans;
int ret;
/* Can't hold an open transaction or we run the risk of deadlocking. */
@@ -3769,17 +4209,9 @@ static int try_flush_qgroup(struct btrfs_root *root)
ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
-
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- if (ret == -ENOENT)
- ret = 0;
- goto out;
- }
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(root);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
@@ -3797,8 +4229,8 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
u64 to_reserve;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
- !is_fstree(root->root_key.objectid) || len == 0)
+ if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
+ !is_fstree(btrfs_root_id(root)) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -3914,7 +4346,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
+ btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed,
BTRFS_QGROUP_RSV_DATA);
if (freed_ret)
*freed_ret = freed;
@@ -3932,8 +4364,11 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
- return 0;
+ if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
+ return clear_record_extent_bits(&inode->io_tree, start,
+ start + len - 1,
+ EXTENT_QGROUP_RESERVED, NULL);
+ }
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
@@ -3951,7 +4386,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
if (released)
*released = changeset.bytes_changed;
@@ -4045,8 +4480,8 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->root_key.objectid) || num_bytes == 0)
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
+ !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
@@ -4082,18 +4517,22 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
+/*
+ * Per-transaction meta reservation should be all freed at transaction commit
+ * time
+ */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->root_key.objectid))
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
+ !is_fstree(btrfs_root_id(root)))
return;
/* TODO: Update trace point to handle such free */
trace_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
- btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -4102,8 +4541,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->root_key.objectid))
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
+ !is_fstree(btrfs_root_id(root)))
return;
/*
@@ -4114,8 +4553,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
- btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
- num_bytes, type);
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
@@ -4152,18 +4590,24 @@ out:
spin_unlock(&fs_info->qgroup_lock);
}
+/*
+ * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
+ *
+ * This is called when preallocated meta reservation needs to be used.
+ * Normally after btrfs_join_transaction() call.
+ */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->root_key.objectid))
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
+ !is_fstree(btrfs_root_id(root)))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
- qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
+ qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
if (!sb_rdonly(fs_info->sb))
add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -4192,7 +4636,7 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
btrfs_ino(inode), unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
@@ -4266,7 +4710,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
@@ -4376,9 +4820,9 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
int ret = 0;
int i;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
@@ -4449,13 +4893,69 @@ out:
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
struct btrfs_qgroup_extent_record *entry;
- struct btrfs_qgroup_extent_record *next;
- struct rb_root *root;
+ unsigned long index;
- root = &trans->delayed_refs.dirty_extent_root;
- rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
+ xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) {
ulist_free(entry->old_roots);
kfree(entry);
}
- *root = RB_ROOT;
+ xa_destroy(&trans->delayed_refs.dirty_extents);
+}
+
+void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
+{
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+ return;
+
+ if (!is_fstree(root))
+ return;
+
+ btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA);
+}
+
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+ const struct btrfs_squota_delta *delta)
+{
+ int ret;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *qg;
+ LIST_HEAD(qgroup_list);
+ u64 root = delta->root;
+ u64 num_bytes = delta->num_bytes;
+ const int sign = (delta->is_inc ? 1 : -1);
+
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+ return 0;
+
+ if (!is_fstree(root))
+ return 0;
+
+ /* If the extent predates enabling quotas, don't count it. */
+ if (delta->generation < fs_info->qgroup_enable_gen)
+ return 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, root);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = 0;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qg, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
+
+ qg->excl += num_bytes * sign;
+ qg->rfer += num_bytes * sign;
+ qgroup_dirty(fs_info, qg);
+
+ list_for_each_entry(glist, &qg->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
+ }
+ qgroup_iterator_clean(&qgroup_list);
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ return ret;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 1203f0632099..c229256d6fd5 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,12 +6,22 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
+#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/kobject.h>
-#include "ulist.h"
-#include "delayed-ref.h"
-#include "misc.h"
+#include <linux/list.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
+struct extent_changeset;
+struct btrfs_delayed_extent_op;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_ioctl_quota_ctl_args;
+struct btrfs_trans_handle;
+struct btrfs_delayed_ref_root;
+struct btrfs_inode;
/*
* Btrfs qgroup overview
@@ -101,15 +111,22 @@
* subtree rescan for them.
*/
-#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
-#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+/*
+ * These flags share the flags field of the btrfs_qgroup_status_item with the
+ * persisted flags defined in btrfs_tree.h.
+ *
+ * To minimize the chance of collision with new persisted status flags, these
+ * count backwards from the MSB.
+ */
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62)
+
+#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3)
/*
* Record a dirty extent, and info qgroup to update quota on it
- * TODO: Use kmem cache to alloc it.
*/
struct btrfs_qgroup_extent_record {
- struct rb_node node;
u64 bytenr;
u64 num_bytes;
@@ -229,6 +246,24 @@ struct btrfs_qgroup {
* finished.
*/
struct list_head iterator;
+
+ /*
+ * For nested iterator usage.
+ *
+ * Here we support at most one level of nested iterator calls like:
+ *
+ * LIST_HEAD(all_qgroups);
+ * {
+ * LIST_HEAD(local_qgroups);
+ * qgroup_iterator_add(local_qgroups, qg);
+ * qgroup_iterator_nested_add(all_qgroups, qg);
+ * do_some_work(local_qgroups);
+ * qgroup_iterator_clean(local_qgroups);
+ * }
+ * do_some_work(all_qgroups);
+ * qgroup_iterator_nested_clean(all_qgroups);
+ */
+ struct list_head nested_iterator;
struct rb_node node; /* tree of qgroups */
/*
@@ -244,6 +279,27 @@ struct btrfs_qgroup {
struct kobject kobj;
};
+/* Glue structure to represent the relations between qgroups. */
+struct btrfs_qgroup_list {
+ struct list_head next_group;
+ struct list_head next_member;
+ struct btrfs_qgroup *group;
+ struct btrfs_qgroup *member;
+};
+
+struct btrfs_squota_delta {
+ /* The fstree root this delta counts against. */
+ u64 root;
+ /* The number of bytes in the extent being counted. */
+ u64 num_bytes;
+ /* The generation the extent was created in. */
+ u64 generation;
+ /* Whether we are using or freeing the extent. */
+ bool is_inc;
+ /* Whether the extent is data or metadata. */
+ bool is_data;
+};
+
static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
{
return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
@@ -258,98 +314,44 @@ enum {
ENUM_BIT(QGROUP_FREE),
};
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
+enum btrfs_qgroup_mode {
+ BTRFS_QGROUP_MODE_DISABLED,
+ BTRFS_QGROUP_MODE_FULL,
+ BTRFS_QGROUP_MODE_SIMPLE
+};
+
+enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info);
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_quota_ctl_args *quota_ctl_args);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
bool interruptible);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
- u64 dst);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
+ struct btrfs_qgroup_list *prealloc);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
+int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
-/*
- * Inform qgroup to trace one dirty extent, its info is recorded in @record.
- * So qgroup can account it at transaction committing time.
- *
- * No lock version, caller must acquire delayed ref lock and allocated memory,
- * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
- *
- * Return 0 for success insert
- * Return >0 for existing record, caller can free @record safely.
- * Error is not possible
- */
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record);
-
-/*
- * Post handler after qgroup_trace_extent_nolock().
- *
- * NOTE: Current qgroup does the expensive backref walk at transaction
- * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
- * new transaction.
- * This is designed to allow btrfs_find_all_roots() to get correct new_roots
- * result.
- *
- * However for old_roots there is no need to do backref walk at that time,
- * since we search commit roots to walk backref and result will always be
- * correct.
- *
- * Due to the nature of no lock version, we can't do backref there.
- * So we must call btrfs_qgroup_trace_extent_post() after exiting
- * spinlock context.
- *
- * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
- * using current root, then we can move all expensive backref walk out of
- * transaction committing, but not now as qgroup accounting will be wrong again.
- */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
-
-/*
- * Inform qgroup to trace one dirty extent, specified by @bytenr and
- * @num_bytes.
- * So qgroup can account it at commit trans time.
- *
- * Better encapsulated version, with memory allocation and backref walk for
- * commit roots.
- * So this can sleep.
- *
- * Return 0 if the operation is done.
- * Return <0 for error, like memory allocation failure or invalid parameter
- * (NULL trans)
- */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes);
-
-/*
- * Inform qgroup to trace all leaf items of data
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM)
- */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb);
-/*
- * Inform qgroup to trace a whole subtree, including all its child tree
- * blocks and data.
- * The root tree block is specified by @root_eb.
- *
- * Normally used by relocation(tree block swap) and subvolume deletion.
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM or tree search error)
- */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
@@ -358,14 +360,18 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct ulist *new_roots);
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
+int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_inherit *inherit,
+ size_t size);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
- u64 objectid, struct btrfs_qgroup_inherit *inherit);
+ u64 objectid, u64 inode_rootid,
+ struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl);
#endif
@@ -417,20 +423,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
BTRFS_QGROUP_RSV_META_PREALLOC);
}
-/*
- * Per-transaction meta reservation should be all freed at transaction commit
- * time
- */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
-
-/*
- * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
- *
- * This is called when preallocated meta reservation needs to be used.
- * Normally after btrfs_join_transaction() call.
- */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
@@ -447,6 +441,9 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
-bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info);
+void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+ const struct btrfs_squota_delta *delta);
#endif
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644
index 000000000000..4c859b550f6c
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/btrfs_tree.h>
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+#include "print-tree.h"
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u64 found_start;
+ u64 found_end;
+ u64 end = start + length;
+ int slot;
+ int ret;
+
+ if (!stripe_root)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ key.objectid = start;
+ key.type = BTRFS_RAID_STRIPE_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ found_start = key.objectid;
+ found_end = found_start + key.offset;
+
+ /* That stripe ends before we start, we're done. */
+ if (found_end <= start)
+ break;
+
+ trace_btrfs_raid_extent_delete(fs_info, start, end,
+ found_start, found_end);
+
+ ASSERT(found_start >= start && found_end <= end);
+ ret = btrfs_del_item(trans, stripe_root, path);
+ if (ret)
+ break;
+
+ start += key.offset;
+ length -= key.offset;
+ if (length == 0)
+ break;
+
+ btrfs_release_path(path);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_raid_extent_item(struct btrfs_trans_handle *trans,
+ struct btrfs_key *key,
+ struct btrfs_stripe_extent *stripe_extent,
+ const size_t item_size)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+ int slot;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
+ 0, 1);
+ if (ret)
+ return (ret == 1 ? ret : -EINVAL);
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
+ item_size);
+ btrfs_mark_buffer_dirty(trans, leaf);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key stripe_key;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
+ struct btrfs_stripe_extent *stripe_extent;
+ const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
+ int ret;
+
+ stripe_extent = kzalloc(item_size, GFP_NOFS);
+ if (!stripe_extent) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ btrfs_end_transaction(trans);
+ return -ENOMEM;
+ }
+
+ trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
+ num_stripes);
+ for (int i = 0; i < num_stripes; i++) {
+ u64 devid = bioc->stripes[i].dev->devid;
+ u64 physical = bioc->stripes[i].physical;
+ u64 length = bioc->stripes[i].length;
+ struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
+
+ if (length == 0)
+ length = bioc->size;
+
+ btrfs_set_stack_raid_stride_devid(raid_stride, devid);
+ btrfs_set_stack_raid_stride_physical(raid_stride, physical);
+ }
+
+ stripe_key.objectid = bioc->logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = bioc->size;
+
+ ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
+ item_size);
+ if (ret == -EEXIST)
+ ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
+ item_size);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ kfree(stripe_extent);
+
+ return ret;
+}
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *ordered_extent)
+{
+ struct btrfs_io_context *bioc;
+ int ret;
+
+ if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
+ return 0;
+
+ list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret)
+ return ret;
+ }
+
+ while (!list_empty(&ordered_extent->bioc_list)) {
+ bioc = list_first_entry(&ordered_extent->bioc_list,
+ typeof(*bioc), rst_ordered_entry);
+ list_del(&bioc->rst_ordered_entry);
+ btrfs_put_bioc(bioc);
+ }
+
+ return 0;
+}
+
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index, struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_stripe_extent *stripe_extent;
+ struct btrfs_key stripe_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ const u64 end = logical + *length;
+ int num_stripes;
+ u64 offset;
+ u64 found_logical;
+ u64 found_length;
+ u64 found_end;
+ int slot;
+ int ret;
+
+ stripe_key.objectid = logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (stripe->rst_search_commit_root) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+
+ ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
+ if (ret < 0)
+ goto free_path;
+ if (ret) {
+ if (path->slots[0] != 0)
+ path->slots[0]--;
+ }
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ found_logical = found_key.objectid;
+ found_length = found_key.offset;
+ found_end = found_logical + found_length;
+
+ if (found_logical > end) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (in_range(logical, found_logical, found_length))
+ break;
+
+ ret = btrfs_next_item(stripe_root, path);
+ if (ret)
+ goto out;
+ }
+
+ offset = logical - found_logical;
+
+ /*
+ * If we have a logically contiguous, but physically non-continuous
+ * range, we need to split the bio. Record the length after which we
+ * must split the bio.
+ */
+ if (end > found_end)
+ *length -= end - found_end;
+
+ num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+ stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+ for (int i = 0; i < num_stripes; i++) {
+ struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
+ u64 devid = btrfs_raid_stride_devid(leaf, stride);
+ u64 physical = btrfs_raid_stride_physical(leaf, stride);
+
+ if (devid != stripe->dev->devid)
+ continue;
+
+ if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
+ continue;
+
+ stripe->physical = physical + offset;
+
+ trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
+ stripe->physical, devid);
+
+ ret = 0;
+ goto free_path;
+ }
+
+ /* If we're here, we haven't found the requested devid in the stripe. */
+ ret = -ENOENT;
+out:
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
+ btrfs_debug(fs_info,
+ "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
+ logical, logical + *length, stripe->dev->devid,
+ btrfs_bg_type_to_raid_name(map_type));
+ }
+free_path:
+ btrfs_free_path(path);
+
+ return ret;
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644
index 000000000000..1ac1c21aac2f
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+#include <linux/types.h>
+#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
+
+#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID1_MASK | \
+ BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID10)
+
+struct btrfs_io_context;
+struct btrfs_io_stripe;
+struct btrfs_fs_info;
+struct btrfs_ordered_extent;
+struct btrfs_trans_handle;
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length);
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index, struct btrfs_io_stripe *stripe);
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *ordered_extent);
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+ u64 map_type)
+{
+ u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE))
+ return false;
+
+ if (type != BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK)
+ return true;
+
+ return false;
+}
+
+static inline int btrfs_num_raid_stripes(u32 item_size)
+{
+ return item_size / sizeof(struct btrfs_raid_stride);
+}
+
+#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3e014b9370a3..8afadf994b8c 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -14,7 +14,6 @@
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
-#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
@@ -41,6 +40,85 @@
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
+{
+ if (unlikely(!bioc)) {
+ btrfs_crit(fs_info, "bioc=NULL");
+ return;
+ }
+ btrfs_crit(fs_info,
+"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
+ bioc->logical, bioc->full_stripe_logical, bioc->size,
+ bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
+ bioc->replace_stripe_src, bioc->num_stripes);
+ for (int i = 0; i < bioc->num_stripes; i++) {
+ btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
+ i, bioc->stripes[i].dev->devid,
+ bioc->stripes[i].physical);
+ }
+}
+
+static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_raid_bio *rbio)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ dump_bioc(fs_info, rbio->bioc);
+ btrfs_crit(fs_info,
+"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
+ rbio->flags, rbio->nr_sectors, rbio->nr_data,
+ rbio->real_stripes, rbio->stripe_nsectors,
+ rbio->scrubp, rbio->dbitmap);
+}
+
+#define ASSERT_RBIO(expr, rbio) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "logical=%llu", (logical)); \
+ } \
+ ASSERT((expr)); \
+})
+
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
struct list_head hash_list;
@@ -122,8 +200,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
struct btrfs_stripe_hash_table *x;
struct btrfs_stripe_hash *cur;
struct btrfs_stripe_hash *h;
- int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
- int i;
+ unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
if (info->stripe_hash_table)
return 0;
@@ -144,7 +221,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
h = table->table;
- for (i = 0; i < num_entries; i++) {
+ for (unsigned int i = 0; i < num_entries; i++) {
cur = h + i;
INIT_LIST_HEAD(&cur->hash_list);
spin_lock_init(&cur->lock);
@@ -332,12 +409,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
static void merge_rbio(struct btrfs_raid_bio *dest,
struct btrfs_raid_bio *victim)
{
- bio_list_merge(&dest->bio_list, &victim->bio_list);
+ bio_list_merge_init(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
/* Also inherit the bitmaps from @victim. */
bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
dest->stripe_nsectors);
- bio_list_init(&victim->bio_list);
}
/*
@@ -594,8 +670,8 @@ static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
unsigned int stripe_nr,
unsigned int sector_nr)
{
- ASSERT(stripe_nr < rbio->real_stripes);
- ASSERT(sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
@@ -875,8 +951,10 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
struct sector_ptr *sector;
int index;
- ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
- ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
+ rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
+ rbio, sector_nr);
index = stripe_nr * rbio->stripe_nsectors + sector_nr;
ASSERT(index >= 0 && index < rbio->nr_sectors);
@@ -918,6 +996,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
*/
ASSERT(stripe_nsectors <= BITS_PER_LONG);
+ /*
+ * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
+ * (limited by u8).
+ */
+ ASSERT(real_stripes >= 2);
+ ASSERT(real_stripes <= U8_MAX);
+
rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -955,6 +1040,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+ ASSERT(rbio->nr_data > 0);
return rbio;
}
@@ -964,7 +1050,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
int ret;
- ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
if (ret < 0)
return ret;
/* Mapping all sectors */
@@ -979,7 +1065,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
int ret;
ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
- rbio->stripe_pages + data_pages);
+ rbio->stripe_pages + data_pages, false);
if (ret < 0)
return ret;
@@ -1051,8 +1137,10 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
* thus it can be larger than rbio->real_stripe.
* So here we check against bioc->num_stripes, not rbio->real_stripes.
*/
- ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
- ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
+ rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
+ rbio, sector_nr);
ASSERT(sector->page);
stripe = &rbio->bioc->stripes[stripe_nr];
@@ -1181,6 +1269,26 @@ static inline void bio_list_put(struct bio_list *bio_list)
bio_put(bio);
}
+static void assert_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
+ !IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ /*
+ * At least two stripes (2 disks RAID5), and since real_stripes is U8,
+ * we won't go beyond 256 disks anyway.
+ */
+ ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
+ ASSERT_RBIO(rbio->nr_data > 0, rbio);
+
+ /*
+ * This is another check to make sure nr data stripes is smaller
+ * than total stripes.
+ */
+ ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
+}
+
/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
@@ -1212,6 +1320,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
pointers[stripe++] = kmap_local_page(sector->page) +
sector->pgoff;
+ assert_rbio(rbio);
raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
@@ -1530,7 +1639,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
if (ret < 0)
return ret;
@@ -1549,7 +1658,6 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct work_struct work;
};
/*
@@ -1615,9 +1723,10 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
const u32 sectorsize = fs_info->sectorsize;
u64 cur_logical;
- ASSERT(orig_logical >= full_stripe_start &&
- orig_logical + orig_len <= full_stripe_start +
- rbio->nr_data * BTRFS_STRIPE_LEN);
+ ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * BTRFS_STRIPE_LEN,
+ rbio, orig_logical);
bio_list_add(&rbio->bio_list, orig_bio);
rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
@@ -2363,7 +2472,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
break;
}
}
- ASSERT(i < rbio->real_stripes);
+ ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
return rbio;
@@ -2474,6 +2583,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
}
if (has_qstripe) {
+ assert_rbio(rbio);
/* RAID6, call the library function to fill in our P/Q */
raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
@@ -2528,7 +2638,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
* Replace is running and our parity stripe needs to be duplicated to
* the target device. Check we have a valid source stripe number.
*/
- ASSERT(rbio->bioc->replace_stripe_src >= 0);
+ ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 45e6ff78316f..0d7b4c2fb6ae 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,9 +7,18 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/bio.h>
+#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "volumes.h"
+struct page;
+struct sector_ptr;
+struct btrfs_fs_info;
+
enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
@@ -164,7 +173,7 @@ struct raid56_bio_trace_info {
u8 stripe_nr;
};
-static inline int nr_data_stripes(const struct map_lookup *map)
+static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index 5c2b66d155ef..1c2d7cb1fe6f 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -6,6 +6,12 @@
#ifndef BTRFS_RCU_STRING_H
#define BTRFS_RCU_STRING_H
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include <linux/printk.h>
+
struct rcu_string {
struct rcu_head rcu;
char str[];
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 28ac7995716e..2928abf7eb82 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
- int type, ret;
+ int type;
+ int ret = 0;
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
@@ -485,6 +486,13 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
ret = add_shared_data_ref(fs_info, offset, count,
key->objectid, key->offset);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) {
+ btrfs_err(fs_info,
+ "found extent owner ref without simple quotas enabled");
+ ret = -EINVAL;
+ }
+ break;
default:
btrfs_err(fs_info, "invalid key type in iref");
ret = -EINVAL;
@@ -652,7 +660,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
+ * Called when we modify a ref for a bytenr.
*
* This will add an action item to the given bytenr and do sanity checks to make
* sure we haven't messed something up. If we are making a new allocation and
@@ -670,7 +678,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
int ret = 0;
bool metadata;
u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
+ u64 num_bytes = generic_ref->num_bytes;
u64 parent = generic_ref->parent;
u64 ref_root = 0;
u64 owner = 0;
@@ -681,11 +689,11 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.owning_root;
+ ref_root = generic_ref->ref_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.owning_root;
- owner = generic_ref->data_ref.ino;
+ ref_root = generic_ref->ref_root;
+ owner = generic_ref->data_ref.objectid;
offset = generic_ref->data_ref.offset;
}
metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 855de37719b5..3511e1a5c96b 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -6,7 +6,16 @@
#ifndef BTRFS_REF_VERIFY_H
#define BTRFS_REF_VERIFY_H
+#include <linux/types.h>
+#include <linux/rbtree_types.h>
+
+struct btrfs_fs_info;
+struct btrfs_ref;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+
+#include <linux/spinlock.h>
+
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 9f60aa79a8c5..f0824c948cb7 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -25,12 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
const u64 olen,
int no_time_update)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
if (!no_time_update) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}
/*
* We round up to the block size at eof when determining which
@@ -43,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -67,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
@@ -84,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret)
goto out;
- page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
- btrfs_alloc_write_mask(mapping));
- if (!page) {
+ folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ btrfs_alloc_write_mask(mapping));
+ if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out_unlock;
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto out_unlock;
@@ -116,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
if (comp_type == BTRFS_COMPRESS_NONE) {
- memcpy_to_page(page, offset_in_page(file_offset), data_start,
- datal);
+ memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
+ datal);
} else {
- ret = btrfs_decompress(comp_type, data_start, page,
- offset_in_page(file_offset),
+ ret = btrfs_decompress(comp_type, data_start, folio,
+ offset_in_folio(folio, file_offset),
inline_size, datal);
if (ret)
goto out_unlock;
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
}
/*
@@ -140,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
* So what's in the range [500, 4095] corresponds to zeroes.
*/
if (datal < block_size)
- memzero_page(page, datal, block_size - datal);
+ folio_zero_range(folio, datal, block_size - datal);
- btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
- btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
+ btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
+ btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
+ btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
out_unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (!IS_ERR(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
}
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
@@ -175,7 +175,7 @@ static int clone_copy_inline_extent(struct inode *dst,
char *inline_data,
struct btrfs_trans_handle **trans_out)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dst);
struct btrfs_root *root = BTRFS_I(dst)->root;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
@@ -338,7 +338,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
const u64 destoff, int no_time_update)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_path *path = NULL;
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
@@ -617,35 +617,6 @@ out:
return ret;
}
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
-}
-
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- u64 range1_end = loff1 + len - 1;
- u64 range2_end = loff2 + len - 1;
-
- if (inode1 < inode2) {
- swap(inode1, inode2);
- swap(loff1, loff2);
- swap(range1_end, range2_end);
- } else if (inode1 == inode2 && loff2 < loff1) {
- swap(loff1, loff2);
- swap(range1_end, range2_end);
- }
-
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
-
- btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
- btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
-}
-
static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
if (inode1 < inode2)
@@ -663,17 +634,21 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
+ const u64 end = dst_loff + len - 1;
+ struct extent_state *cached_state = NULL;
struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
const u64 bs = fs_info->sectorsize;
int ret;
/*
- * Lock destination range to serialize with concurrent readahead() and
- * source range to serialize with relocation.
+ * Lock destination range to serialize with concurrent readahead(), and
+ * we are safe from concurrency with relocation of source extents
+ * because we have already locked the inode's i_mmap_lock in exclusive
+ * mode.
*/
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
btrfs_btree_balance_dirty(fs_info);
@@ -691,7 +666,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
if (root_dst->send_in_progress) {
btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
- root_dst->root_key.objectid,
+ btrfs_root_id(root_dst),
root_dst->send_in_progress);
spin_unlock(&root_dst->root_item_lock);
return -EAGAIN;
@@ -725,13 +700,15 @@ out:
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
u64 off, u64 olen, u64 destoff)
{
+ struct extent_state *cached_state = NULL;
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
int ret;
int wb_ret;
u64 len = olen;
u64 bs = fs_info->sectorsize;
+ u64 end;
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
@@ -757,26 +734,29 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* we found the previous extent covering eof and before we
* attempted to increment its reference count).
*/
- ret = btrfs_wait_ordered_range(inode, wb_start,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start,
destoff - wb_start);
if (ret)
return ret;
}
/*
- * Lock destination range to serialize with concurrent readahead() and
- * source range to serialize with relocation.
+ * Lock destination range to serialize with concurrent readahead(), and
+ * we are safe from concurrency with relocation of source extents
+ * because we have already locked the inode's i_mmap_lock in exclusive
+ * mode.
*/
- btrfs_double_extent_lock(src, off, inode, destoff, len);
+ end = destoff + len - 1;
+ lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
/*
* We may have copied an inline extent into a page of the destination
* range, so wait for writeback to complete before truncating pages
* from the page cache. This is a rare case.
*/
- wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+ wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len);
ret = ret ? ret : wb_ret;
/*
* Truncate page cache pages so that future reads will see the cloned
@@ -856,11 +836,11 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs),
wb_len);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs),
wb_len);
if (ret < 0)
return ret;
diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h
index ecb309b4dad0..1e291f7d85c4 100644
--- a/fs/btrfs/reflink.h
+++ b/fs/btrfs/reflink.h
@@ -3,7 +3,9 @@
#ifndef BTRFS_REFLINK_H
#define BTRFS_REFLINK_H
-#include <linux/fs.h>
+#include <linux/types.h>
+
+struct file;
loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 299eac696eb4..79eb984041dd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -36,6 +36,7 @@
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
+#include "raid-stripe-tree.h"
/*
* Relocation overview
@@ -111,8 +112,8 @@ struct tree_block {
}; /* Use rb_simple_node for search/insert */
u64 owner;
struct btrfs_key key;
- unsigned int level:8;
- unsigned int key_ready:1;
+ u8 level;
+ bool key_ready;
};
#define MAX_EXTENTS 128
@@ -122,6 +123,13 @@ struct file_extent_cluster {
u64 end;
u64 boundary[MAX_EXTENTS];
unsigned int nr;
+ u64 owning_root;
+};
+
+/* Stages of data relocation. */
+enum reloc_stage {
+ MOVE_DATA_EXTENTS,
+ UPDATE_DATA_PTRS
};
struct reloc_control {
@@ -155,16 +163,12 @@ struct reloc_control {
u64 search_start;
u64 extents_found;
- unsigned int stage:8;
- unsigned int create_reloc_tree:1;
- unsigned int merge_reloc_tree:1;
- unsigned int found_file_extent:1;
+ enum reloc_stage stage;
+ bool create_reloc_tree;
+ bool merge_reloc_tree;
+ bool found_file_extent;
};
-/* stages of data relocation */
-#define MOVE_DATA_EXTENTS 0
-#define UPDATE_DATA_PTRS 1
-
static void mark_block_processed(struct reloc_control *rc,
struct btrfs_backref_node *node)
{
@@ -180,13 +184,6 @@ static void mark_block_processed(struct reloc_control *rc,
node->processed = 1;
}
-
-static void mapping_tree_init(struct mapping_tree *tree)
-{
- tree->rb_root = RB_ROOT;
- spin_lock_init(&tree->lock);
-}
-
/*
* walk up backref nodes until reach node presents tree root
*/
@@ -413,20 +410,19 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
struct btrfs_backref_node *node = NULL;
struct btrfs_backref_edge *edge;
int ret;
- int err = 0;
iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
if (!iter)
return ERR_PTR(-ENOMEM);
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
node = btrfs_backref_alloc_node(cache, bytenr, level);
if (!node) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -437,10 +433,9 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
do {
ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
node_key, cur);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
+
edge = list_first_entry_or_null(&cache->pending_edge,
struct btrfs_backref_edge, list[UPPER]);
/*
@@ -455,19 +450,18 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
/* Finish the upper linkage of newly added edges/nodes */
ret = btrfs_backref_finish_upper_links(cache, node);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (handle_useless_nodes(rc, node))
node = NULL;
out:
- btrfs_backref_iter_free(iter);
+ btrfs_free_path(iter->path);
+ kfree(iter);
btrfs_free_path(path);
- if (err) {
+ if (ret) {
btrfs_backref_error_cleanup(cache, node);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
ASSERT(!node || !node->detached);
ASSERT(list_empty(&cache->useless_node) &&
@@ -564,7 +558,7 @@ fail:
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
-static int __must_check __add_reloc_root(struct btrfs_root *root)
+static int __add_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
@@ -690,9 +684,28 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = objectid;
- if (root->root_key.objectid == objectid) {
+ if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
+ /*
+ * Relocation will wait for cleaner thread, and any half-dropped
+ * subvolume will be fully cleaned up at mount time.
+ * So here we shouldn't hit a subvolume with non-zero drop_progress.
+ *
+ * If this isn't the case, error out since it can make us attempt to
+ * drop references for extents that were already dropped before.
+ */
+ if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
+ struct btrfs_key cpu_key;
+
+ btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
+ btrfs_err(fs_info,
+ "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
+ objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
+ ret = -EUCLEAN;
+ goto fail;
+ }
+
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
@@ -734,7 +747,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
btrfs_set_root_level(root_item, btrfs_header_level(eb));
btrfs_set_root_generation(root_item, trans->transid);
- if (root->root_key.objectid == objectid) {
+ if (btrfs_root_id(root) == objectid) {
btrfs_set_root_refs(root_item, 0);
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
@@ -757,7 +770,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
goto abort;
}
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
return reloc_root;
fail:
kfree(root_item);
@@ -804,7 +817,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
*/
if (root->reloc_root) {
reloc_root = root->reloc_root;
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
return 0;
}
@@ -812,8 +825,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
* We are merging reloc roots, we do not need new reloc trees. Also
* reloc trees never need their own reloc tree.
*/
- if (!rc->create_reloc_tree ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
return 0;
if (!trans->reloc_reserved) {
@@ -821,7 +833,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
trans->block_rsv = rc->block_rsv;
clear_rsv = 1;
}
- reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+ reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
if (clear_rsv)
trans->block_rsv = rsv;
if (IS_ERR(reloc_root))
@@ -888,60 +900,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
}
/*
- * helper to find first cached inode with inode number >= objectid
- * in a subvolume
- */
-static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
-{
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
-
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
-
- if (objectid < btrfs_ino(entry))
- node = node->rb_left;
- else if (objectid > btrfs_ino(entry))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(entry)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- return inode;
- }
-
- objectid = btrfs_ino(entry) + 1;
- if (cond_resched_lock(&root->inode_lock))
- goto again;
-
- node = rb_next(node);
- }
- spin_unlock(&root->inode_lock);
- return NULL;
-}
-
-/*
* get new location of data
*/
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
@@ -957,7 +915,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
if (!path)
return -ENOMEM;
- bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+ bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
if (ret < 0)
@@ -1001,7 +959,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
u64 parent;
u64 bytenr;
u64 new_bytenr = 0;
@@ -1017,7 +975,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
return 0;
/* reloc trees always use full backref */
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
parent = leaf->start;
else
parent = 0;
@@ -1046,15 +1004,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
* if we are modifying block in fs tree, wait for read_folio
* to complete and drop the extent cache
*/
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
if (first) {
- inode = find_next_inode(root, key.objectid);
+ inode = btrfs_find_first_inode(root, key.objectid);
first = 0;
- } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
- inode = find_next_inode(root, key.objectid);
+ } else if (inode && btrfs_ino(inode) < key.objectid) {
+ btrfs_add_delayed_iput(inode);
+ inode = btrfs_find_first_inode(root, key.objectid);
}
- if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
+ if (inode && btrfs_ino(inode) == key.objectid) {
struct extent_state *cached_state = NULL;
end = key.offset +
@@ -1063,16 +1021,20 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
fs_info->sectorsize));
WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
- ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end,
- &cached_state);
- if (!ret)
+ /* Take mmap lock to serialize with reflinks. */
+ if (!down_read_trylock(&inode->i_mmap_lock))
continue;
+ ret = try_lock_extent(&inode->io_tree, key.offset,
+ end, &cached_state);
+ if (!ret) {
+ up_read(&inode->i_mmap_lock);
+ continue;
+ }
- btrfs_drop_extent_map_range(BTRFS_I(inode),
- key.offset, end, true);
- unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end, &cached_state);
+ btrfs_drop_extent_map_range(inode, key.offset, end, true);
+ unlock_extent(&inode->io_tree, key.offset, end,
+ &cached_state);
+ up_read(&inode->i_mmap_lock);
}
}
@@ -1090,22 +1052,28 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- num_bytes, parent);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset,
- root->root_key.objectid, false);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = parent;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_header_owner(leaf);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, parent);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset,
- root->root_key.objectid, false);
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = parent;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_header_owner(leaf);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1115,7 +1083,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (dirty)
btrfs_mark_buffer_dirty(trans, leaf);
if (inode)
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
return ret;
}
@@ -1161,8 +1129,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
int ret;
int slot;
- ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
- ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
@@ -1314,39 +1282,54 @@ again:
path->slots[level], old_ptr_gen);
btrfs_mark_buffer_dirty(trans, path->nodes[level]);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
- blocksize, path->nodes[level]->start);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
- 0, true);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = old_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = path->nodes[level]->start;
+ ref.owning_root = btrfs_root_id(src);
+ ref.ref_root = btrfs_root_id(src);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- blocksize, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
- true);
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(dest);
+ ref.ref_root = btrfs_root_id(dest);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
- blocksize, path->nodes[level]->start);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
- 0, true);
+ /* We don't know the real owning_root, use 0. */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = path->nodes[level]->start;
+ ref.owning_root = 0;
+ ref.ref_root = btrfs_root_id(src);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
- blocksize, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
- 0, true);
+ /* We don't know the real owning_root, use 0. */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = old_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = 0;
+ ref.owning_root = 0;
+ ref.ref_root = btrfs_root_id(dest);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1454,7 +1437,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
const struct btrfs_key *max_key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
u64 objectid;
u64 start, end;
u64 ino;
@@ -1464,23 +1447,24 @@ static int invalidate_extent_cache(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
cond_resched();
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
if (objectid > max_key->objectid)
break;
- inode = find_next_inode(root, objectid);
+ inode = btrfs_find_first_inode(root, objectid);
if (!inode)
break;
- ino = btrfs_ino(BTRFS_I(inode));
+ ino = btrfs_ino(inode);
if (ino > max_key->objectid) {
- iput(inode);
+ iput(&inode->vfs_inode);
break;
}
objectid = ino + 1;
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->vfs_inode.i_mode))
continue;
if (unlikely(min_key->objectid == ino)) {
@@ -1513,9 +1497,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_drop_extent_map_range(inode, start, end, true);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
}
return 0;
}
@@ -1550,7 +1534,7 @@ static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
int ret;
/* @root must be a subvolume tree root with a valid reloc tree */
- ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
ASSERT(reloc_root);
reloc_root_item = &reloc_root->root_item;
@@ -1579,7 +1563,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
reloc_dirty_list) {
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
/* Merged subvolume, cleanup its reloc root */
struct btrfs_root *reloc_root = root->reloc_root;
@@ -1708,7 +1692,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* btrfs_update_reloc_root() and update our root item
* appropriately.
*/
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
trans->block_rsv = rc->block_rsv;
replaced = 0;
@@ -1829,7 +1813,7 @@ again:
}
}
- rc->merge_reloc_tree = 1;
+ rc->merge_reloc_tree = true;
while (!list_empty(&rc->reloc_roots)) {
reloc_root = list_entry(rc->reloc_roots.next,
@@ -1854,13 +1838,13 @@ again:
if (root->reloc_root) {
btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
- root->root_key.objectid,
- root->reloc_root->root_key.objectid,
+ btrfs_root_id(root),
+ btrfs_root_id(root->reloc_root),
root->reloc_root->root_key.type,
root->reloc_root->root_key.offset,
btrfs_root_generation(
&root->reloc_root->root_item),
- reloc_root->root_key.objectid,
+ btrfs_root_id(reloc_root),
reloc_root->root_key.type,
reloc_root->root_key.offset,
btrfs_root_generation(
@@ -1868,8 +1852,8 @@ again:
} else {
btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
- root->root_key.objectid,
- reloc_root->root_key.objectid,
+ btrfs_root_id(root),
+ btrfs_root_id(reloc_root),
reloc_root->root_key.type,
reloc_root->root_key.offset,
btrfs_root_generation(
@@ -2051,7 +2035,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root;
int ret;
- if (reloc_root->last_trans == trans->transid)
+ if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
return 0;
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
@@ -2126,7 +2110,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
return ERR_PTR(-EUCLEAN);
}
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
@@ -2233,7 +2217,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return root;
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
fs_root = root;
if (next != node)
@@ -2249,9 +2233,8 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
return fs_root;
}
-static noinline_for_stack
-u64 calcu_metadata_size(struct reloc_control *rc,
- struct btrfs_backref_node *node, int reserve)
+static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_backref_node *next = node;
@@ -2260,12 +2243,12 @@ u64 calcu_metadata_size(struct reloc_control *rc,
u64 num_bytes = 0;
int index = 0;
- BUG_ON(reserve && node->processed);
+ BUG_ON(node->processed);
while (next) {
cond_resched();
while (1) {
- if (next->processed && (reserve || next != node))
+ if (next->processed)
break;
num_bytes += fs_info->nodesize;
@@ -2293,7 +2276,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
int ret;
u64 tmp;
- num_bytes = calcu_metadata_size(rc, node, 1) * 2;
+ num_bytes = calcu_metadata_size(rc, node) * 2;
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
@@ -2356,8 +2339,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
- struct btrfs_ref ref = { 0 };
-
cond_resched();
upper = edge->node[UPPER];
@@ -2445,18 +2426,23 @@ static int do_relocation(struct btrfs_trans_handle *trans,
*/
ASSERT(node->eb == eb);
} else {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = node->eb->start,
+ .num_bytes = blocksize,
+ .parent = upper->eb->start,
+ .owning_root = btrfs_header_owner(upper->eb),
+ .ref_root = btrfs_header_owner(upper->eb),
+ };
+
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
btrfs_set_node_ptr_generation(upper->eb, slot,
trans->transid);
btrfs_mark_buffer_dirty(trans, upper->eb);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- node->eb->start, blocksize,
- upper->eb->start);
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb),
- root->root_key.objectid, false);
+ btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
@@ -2565,7 +2551,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
u32 blocksize = rc->extent_root->fs_info->nodesize;
if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+ bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
return 1;
return 0;
}
@@ -2592,7 +2578,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
else
btrfs_node_key_to_cpu(eb, &block->key, 0);
free_extent_buffer(eb);
- block->key_ready = 1;
+ block->key_ready = true;
return 0;
}
@@ -2708,12 +2694,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct tree_block *block;
struct tree_block *next;
- int ret;
- int err = 0;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out_free_blocks;
}
@@ -2728,8 +2713,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Get first keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready) {
- err = get_tree_block_key(fs_info, block);
- if (err)
+ ret = get_tree_block_key(fs_info, block);
+ if (ret)
goto out_free_path;
}
}
@@ -2739,35 +2724,33 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
node = build_backref_tree(trans, rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
- err = PTR_ERR(node);
+ ret = PTR_ERR(node);
goto out;
}
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
break;
- }
}
out:
- err = finish_pending_nodes(trans, rc, path, err);
+ ret = finish_pending_nodes(trans, rc, path, ret);
out_free_path:
btrfs_free_path(path);
out_free_blocks:
free_block_list(blocks);
- return err;
+ return ret;
}
-static noinline_for_stack int prealloc_file_extent_cluster(
- struct btrfs_inode *inode,
- const struct file_extent_cluster *cluster)
+static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc)
{
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
u64 alloc_hint = 0;
u64 start;
u64 end;
- u64 offset = inode->index_cnt;
+ u64 offset = inode->reloc_block_group_start;
u64 num_bytes;
int nr;
int ret = 0;
@@ -2782,7 +2765,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
* btrfs_do_readpage() call of previously relocated file cluster.
*
* If the current cluster starts in the above range, btrfs_do_readpage()
- * will skip the read, and relocate_one_page() will later writeback
+ * will skip the read, and relocate_one_folio() will later writeback
* the padding zeros as new data, causing data corruption.
*
* Here we have to manually invalidate the range (i_size, PAGE_END + 1).
@@ -2791,7 +2774,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
struct address_space *mapping = inode->vfs_inode.i_mapping;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- struct page *page;
+ struct folio *folio;
ASSERT(sectorsize < PAGE_SIZE);
ASSERT(IS_ALIGNED(i_size, sectorsize));
@@ -2822,16 +2805,16 @@ static noinline_for_stack int prealloc_file_extent_cluster(
clear_extent_bits(&inode->io_tree, i_size,
round_up(i_size, PAGE_SIZE) - 1,
EXTENT_UPTODATE);
- page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+ folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
/*
* If page is freed we don't need to do anything then, as we
* will re-read the whole page anyway.
*/
- if (page) {
- btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ if (!IS_ERR(folio)) {
+ btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
round_up(i_size, PAGE_SIZE) - i_size);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
}
@@ -2869,11 +2852,14 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return ret;
}
-static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
- u64 start, u64 end, u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc)
{
+ struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
struct extent_map *em;
struct extent_state *cached_state = NULL;
+ u64 offset = inode->reloc_block_group_start;
+ u64 start = rc->cluster.start - offset;
+ u64 end = rc->cluster.end - offset;
int ret = 0;
em = alloc_extent_map();
@@ -2882,13 +2868,14 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->start = start;
em->len = end + 1 - start;
- em->block_len = em->len;
- em->block_start = block_start;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
- ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ em->disk_bytenr = rc->cluster.start;
+ em->disk_num_bytes = em->len;
+ em->ram_bytes = em->len;
+ em->flags |= EXTENT_FLAG_PINNED;
+
+ lock_extent(&inode->io_tree, start, end, &cached_state);
+ ret = btrfs_replace_extent_map_range(inode, em, false);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
free_extent_map(em);
return ret;
@@ -2916,68 +2903,91 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
return cluster->boundary[cluster_nr + 1] - 1;
}
-static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
- const struct file_extent_cluster *cluster,
- int *cluster_nr, unsigned long page_index)
+static int relocate_one_folio(struct reloc_control *rc,
+ struct file_ra_state *ra,
+ int *cluster_nr, unsigned long index)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 offset = BTRFS_I(inode)->index_cnt;
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ struct inode *inode = rc->data_inode;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ u64 offset = BTRFS_I(inode)->reloc_block_group_start;
const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- struct page *page;
- u64 page_start;
- u64 page_end;
+ struct folio *folio;
+ u64 folio_start;
+ u64 folio_end;
u64 cur;
int ret;
+ const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
+
+ ASSERT(index <= last_index);
+again:
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio)) {
- ASSERT(page_index <= last_index);
- page = find_lock_page(inode->i_mapping, page_index);
- if (!page) {
- page_cache_sync_readahead(inode->i_mapping, ra, NULL,
- page_index, last_index + 1 - page_index);
- page = find_or_create_page(inode->i_mapping, page_index, mask);
- if (!page)
- return -ENOMEM;
+ /*
+ * On relocation we're doing readahead on the relocation inode,
+ * but if the filesystem is backed by a RAID stripe tree we can
+ * get ENOENT (e.g. due to preallocated extents not being
+ * mapped in the RST) from the lookup.
+ *
+ * But readahead doesn't handle the error and submits invalid
+ * reads to the device, causing a assertion failures.
+ */
+ if (!use_rst)
+ page_cache_sync_readahead(inode->i_mapping, ra, NULL,
+ index, last_index + 1 - index);
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mask);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
}
- if (PageReadahead(page))
+ WARN_ON(folio_order(folio));
+
+ if (folio_test_readahead(folio) && !use_rst)
page_cache_async_readahead(inode->i_mapping, ra, NULL,
- page_folio(page), page_index,
- last_index + 1 - page_index);
+ folio, last_index + 1 - index);
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
- goto release_page;
+ goto release_folio;
+ }
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
}
}
/*
- * We could have lost page private when we dropped the lock to read the
- * page above, make sure we set_page_extent_mapped here so we have any
+ * We could have lost folio private when we dropped the lock to read the
+ * folio above, make sure we set_page_extent_mapped here so we have any
* of the subpage blocksize stuff we need in place.
*/
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
- goto release_page;
+ goto release_folio;
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
+ folio_start = folio_pos(folio);
+ folio_end = folio_start + PAGE_SIZE - 1;
/*
* Start from the cluster, as for subpage case, the cluster can start
- * inside the page.
+ * inside the folio.
*/
- cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
- while (cur <= page_end) {
+ cur = max(folio_start, cluster->boundary[*cluster_nr] - offset);
+ while (cur <= folio_end) {
struct extent_state *cached_state = NULL;
u64 extent_start = cluster->boundary[*cluster_nr] - offset;
u64 extent_end = get_cluster_boundary_end(cluster,
*cluster_nr) - offset;
- u64 clamped_start = max(page_start, extent_start);
- u64 clamped_end = min(page_end, extent_end);
+ u64 clamped_start = max(folio_start, extent_start);
+ u64 clamped_end = min(folio_end, extent_end);
u32 clamped_len = clamped_end + 1 - clamped_start;
/* Reserve metadata for this range */
@@ -2985,7 +2995,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len, clamped_len,
false);
if (ret)
- goto release_page;
+ goto release_folio;
/* Mark the range delalloc and dirty for later writeback */
lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
@@ -3001,19 +3011,18 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
clamped_len);
- goto release_page;
+ goto release_folio;
}
- btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+ btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len);
/*
- * Set the boundary if it's inside the page.
+ * Set the boundary if it's inside the folio.
* Data relocation requires the destination extents to have the
* same size as the source.
* EXTENT_BOUNDARY bit prevents current extent from being merged
* with previous extent.
*/
- if (in_range(cluster->boundary[*cluster_nr] - offset,
- page_start, PAGE_SIZE)) {
+ if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
u64 boundary_start = cluster->boundary[*cluster_nr] -
offset;
u64 boundary_end = boundary_start +
@@ -3036,8 +3045,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
break;
}
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
@@ -3045,16 +3054,17 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
ret = -ECANCELED;
return ret;
-release_page:
- unlock_page(page);
- put_page(page);
+release_folio:
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
-static int relocate_file_extent_cluster(struct inode *inode,
- const struct file_extent_cluster *cluster)
+static int relocate_file_extent_cluster(struct reloc_control *rc)
{
- u64 offset = BTRFS_I(inode)->index_cnt;
+ struct inode *inode = rc->data_inode;
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ u64 offset = BTRFS_I(inode)->reloc_block_group_start;
unsigned long index;
unsigned long last_index;
struct file_ra_state *ra;
@@ -3068,21 +3078,20 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
+ ret = prealloc_file_extent_cluster(rc);
if (ret)
goto out;
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
- cluster->end - offset, cluster->start);
+ ret = setup_relocation_extent_mapping(rc);
if (ret)
goto out;
last_index = (cluster->end - offset) >> PAGE_SHIFT;
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
- ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
+ ret = relocate_one_folio(rc, ra, &cluster_nr, index);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3090,21 +3099,53 @@ out:
return ret;
}
-static noinline_for_stack int relocate_data_extent(struct inode *inode,
- const struct btrfs_key *extent_key,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int relocate_data_extent(struct reloc_control *rc,
+ const struct btrfs_key *extent_key)
{
+ struct inode *inode = rc->data_inode;
+ struct file_extent_cluster *cluster = &rc->cluster;
int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
- ret = relocate_file_extent_cluster(inode, cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret)
return ret;
cluster->nr = 0;
}
- if (!cluster->nr)
+ /*
+ * Under simple quotas, we set root->relocation_src_root when we find
+ * the extent. If adjacent extents have different owners, we can't merge
+ * them while relocating. Handle this by storing the owning root that
+ * started a cluster and if we see an extent from a different root break
+ * cluster formation (just like the above case of non-adjacent extents).
+ *
+ * Without simple quotas, relocation_src_root is always 0, so we should
+ * never see a mismatch, and it should have no effect on relocation
+ * clusters.
+ */
+ if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) {
+ u64 tmp = root->relocation_src_root;
+
+ /*
+ * root->relocation_src_root is the state that actually affects
+ * the preallocation we do here, so set it to the root owning
+ * the cluster we need to relocate.
+ */
+ root->relocation_src_root = cluster->owning_root;
+ ret = relocate_file_extent_cluster(rc);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ /* And reset it back for the current extent's owning root. */
+ root->relocation_src_root = tmp;
+ }
+
+ if (!cluster->nr) {
cluster->start = extent_key->objectid;
+ cluster->owning_root = root->relocation_src_root;
+ }
else
BUG_ON(cluster->nr >= MAX_EXTENTS);
cluster->end = extent_key->objectid + extent_key->offset - 1;
@@ -3112,7 +3153,7 @@ static noinline_for_stack int relocate_data_extent(struct inode *inode,
cluster->nr++;
if (cluster->nr >= MAX_EXTENTS) {
- ret = relocate_file_extent_cluster(inode, cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret)
return ret;
cluster->nr = 0;
@@ -3210,7 +3251,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key.objectid = rc->extent_root->fs_info->nodesize;
block->key.offset = generation;
block->level = level;
- block->key_ready = 0;
+ block->key_ready = false;
block->owner = owner;
rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
@@ -3306,7 +3347,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
if (inode)
goto truncate;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget(ino, root);
if (IS_ERR(inode))
return -ENOENT;
@@ -3553,7 +3594,7 @@ int prepare_to_relocate(struct reloc_control *rc)
if (ret)
return ret;
- rc->create_reloc_tree = 1;
+ rc->create_reloc_tree = true;
set_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
@@ -3631,6 +3672,21 @@ restart:
struct btrfs_extent_item);
flags = btrfs_extent_flags(path->nodes[0], ei);
+ /*
+ * If we are relocating a simple quota owned extent item, we
+ * need to note the owner on the reloc data root so that when
+ * we allocate the replacement item, we can attribute it to the
+ * correct eventual owner (rather than the reloc data root).
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+ struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
+ u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
+ path->nodes[0],
+ path->slots[0]);
+
+ root->relocation_src_root = owning_root_id;
+ }
+
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
} else if (rc->stage == UPDATE_DATA_PTRS &&
@@ -3663,9 +3719,8 @@ restart:
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
- rc->found_file_extent = 1;
- ret = relocate_data_extent(rc->data_inode,
- &key, &rc->cluster);
+ rc->found_file_extent = true;
+ ret = relocate_data_extent(rc, &key);
if (ret < 0) {
err = ret;
break;
@@ -3694,13 +3749,12 @@ restart:
}
if (!err) {
- ret = relocate_file_extent_cluster(rc->data_inode,
- &rc->cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret < 0)
err = ret;
}
- rc->create_reloc_tree = 0;
+ rc->create_reloc_tree = false;
set_reloc_control(rc);
btrfs_backref_release_cache(&rc->backref_cache);
@@ -3718,7 +3772,7 @@ restart:
merge_reloc_roots(rc);
- rc->merge_reloc_tree = 0;
+ rc->merge_reloc_tree = false;
unset_reloc_control(rc);
btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
@@ -3811,7 +3865,7 @@ static noinline_for_stack struct inode *create_reloc_inode(
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
u64 objectid;
- int err = 0;
+ int ret = 0;
root = btrfs_grab_root(fs_info->data_reloc_root);
trans = btrfs_start_transaction(root, 6);
@@ -3820,31 +3874,31 @@ static noinline_for_stack struct inode *create_reloc_inode(
return ERR_CAST(trans);
}
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
+ ret = btrfs_get_free_objectid(root, &objectid);
+ if (ret)
goto out;
- err = __insert_orphan_inode(trans, root, objectid);
- if (err)
+ ret = __insert_orphan_inode(trans, root, objectid);
+ if (ret)
goto out;
- inode = btrfs_iget(fs_info->sb, objectid, root);
+ inode = btrfs_iget(objectid, root);
if (IS_ERR(inode)) {
delete_orphan_inode(trans, root, objectid);
- err = PTR_ERR(inode);
+ ret = PTR_ERR(inode);
inode = NULL;
goto out;
}
- BTRFS_I(inode)->index_cnt = group->start;
+ BTRFS_I(inode)->reloc_block_group_start = group->start;
- err = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- if (err) {
+ if (ret) {
iput(inode);
- inode = ERR_PTR(err);
+ inode = ERR_PTR(ret);
}
return inode;
}
@@ -3900,8 +3954,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->reloc_roots);
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
- btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
- mapping_tree_init(&rc->reloc_root_tree);
+ btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
+ rc->reloc_root_tree.rb_root = RB_ROOT;
+ spin_lock_init(&rc->reloc_root_tree.lock);
extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -3921,19 +3976,17 @@ static void free_reloc_control(struct reloc_control *rc)
/*
* Print the block group being relocated
*/
-static void describe_relocation(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *block_group)
+static void describe_relocation(struct btrfs_block_group *block_group)
{
char buf[128] = {'\0'};
btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
- btrfs_info(fs_info,
- "relocating block group %llu flags %s",
+ btrfs_info(block_group->fs_info, "relocating block group %llu flags %s",
block_group->start, buf);
}
-static const char *stage_to_string(int stage)
+static const char *stage_to_string(enum reloc_stage stage)
{
if (stage == MOVE_DATA_EXTENTS)
return "move data extents";
@@ -4037,19 +4090,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
goto out;
}
- describe_relocation(fs_info, rc->block_group);
+ describe_relocation(rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
- btrfs_wait_ordered_roots(fs_info, U64_MAX,
- rc->block_group->start,
- rc->block_group->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group);
ret = btrfs_zone_finish(rc->block_group);
WARN_ON(ret && ret != -EAGAIN);
while (1) {
- int finishes_stage;
+ enum reloc_stage finishes_stage;
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
@@ -4068,7 +4119,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
* out of the loop if we hit an error.
*/
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+ ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
(u64)-1);
if (ret)
err = ret;
@@ -4140,8 +4191,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
struct extent_buffer *leaf;
struct reloc_control *rc = NULL;
struct btrfs_trans_handle *trans;
- int ret;
- int err = 0;
+ int ret2;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -4155,15 +4206,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
while (1) {
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
path, 0, 0);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (ret > 0) {
if (path->slots[0] == 0)
break;
path->slots[0]--;
}
+ ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
@@ -4174,7 +4224,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(reloc_root)) {
- err = PTR_ERR(reloc_root);
+ ret = PTR_ERR(reloc_root);
goto out;
}
@@ -4186,15 +4236,12 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
reloc_root->root_key.offset, false);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
- if (ret != -ENOENT) {
- err = ret;
+ if (ret != -ENOENT)
goto out;
- }
ret = mark_garbage_root(reloc_root);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
+ ret = 0;
} else {
btrfs_put_root(fs_root);
}
@@ -4212,15 +4259,13 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
rc = alloc_reloc_control(fs_info);
if (!rc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
ret = reloc_chunk_start(fs_info);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end;
- }
rc->extent_root = btrfs_extent_root(fs_info, 0);
@@ -4228,11 +4273,11 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_unset;
}
- rc->merge_reloc_tree = 1;
+ rc->merge_reloc_tree = true;
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
@@ -4248,15 +4293,15 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (IS_ERR(fs_root)) {
- err = PTR_ERR(fs_root);
+ ret = PTR_ERR(fs_root);
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_end_transaction(trans);
goto out_unset;
}
- err = __add_reloc_root(reloc_root);
- ASSERT(err != -EEXIST);
- if (err) {
+ ret = __add_reloc_root(reloc_root);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(fs_root);
btrfs_end_transaction(trans);
@@ -4266,8 +4311,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_root);
}
- err = btrfs_commit_transaction(trans);
- if (err)
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
goto out_unset;
merge_reloc_roots(rc);
@@ -4276,14 +4321,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_clean;
}
- err = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
out_clean:
- ret = clean_dirty_subvols(rc);
- if (ret < 0 && !err)
- err = ret;
+ ret2 = clean_dirty_subvols(rc);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
out_unset:
unset_reloc_control(rc);
out_end:
@@ -4294,14 +4339,14 @@ out:
btrfs_free_path(path);
- if (err == 0) {
+ if (ret == 0) {
/* cleanup orphan inode in data relocation tree */
fs_root = btrfs_grab_root(fs_info->data_reloc_root);
ASSERT(fs_root);
- err = btrfs_orphan_cleanup(fs_root);
+ ret = btrfs_orphan_cleanup(fs_root);
btrfs_put_root(fs_root);
}
- return err;
+ return ret;
}
/*
@@ -4312,18 +4357,20 @@ out:
*/
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 disk_bytenr = ordered->file_offset + inode->index_cnt;
+ u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
LIST_HEAD(list);
int ret;
ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
disk_bytenr + ordered->num_bytes - 1,
- &list, 0, false);
- if (ret)
+ &list, false);
+ if (ret < 0) {
+ btrfs_mark_ordered_extent_error(ordered);
return ret;
+ }
while (!list_empty(&list)) {
struct btrfs_ordered_sum *sums =
@@ -4373,13 +4420,22 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
btrfs_root_last_snapshot(&root->root_item))
first_cow = 1;
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
- rc->create_reloc_tree) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level];
- BUG_ON(node->bytenr != buf->start &&
- node->new_bytenr != buf->start);
+
+ /*
+ * If node->bytenr != buf->start and node->new_bytenr !=
+ * buf->start then we've got the wrong backref node for what we
+ * expected to see here and the cache is incorrect.
+ */
+ if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
+ btrfs_err(fs_info,
+"bytenr %llu was found but our backref cache was expecting %llu or %llu",
+ buf->start, node->bytenr, node->new_bytenr);
+ return -EUCLEAN;
+ }
btrfs_backref_drop_node_buffer(node);
atomic_inc(&cow->refs);
@@ -4467,8 +4523,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
}
new_root = pending->snap;
- reloc_root = create_reloc_root(trans, root->reloc_root,
- new_root->root_key.objectid);
+ reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
if (IS_ERR(reloc_root))
return PTR_ERR(reloc_root);
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 5fb60f2deb53..788c86d8633a 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -3,6 +3,15 @@
#ifndef BTRFS_RELOCATION_H
#define BTRFS_RELOCATION_H
+#include <linux/types.h>
+
+struct extent_buffer;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_trans_handle;
+struct btrfs_ordered_extent;
+struct btrfs_pending_snapshot;
+
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index aac18f620de4..33962671a96c 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -10,7 +10,6 @@
#include "messages.h"
#include "transaction.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "qgroup.h"
#include "space-info.h"
#include "accessors.h"
@@ -51,7 +50,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
}
/*
- * btrfs_find_root - lookup the root by the key.
+ * Lookup the root by the key.
+ *
* root: the root of the root tree
* search_key: the key to search
* path: the path we search
@@ -81,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
if (ret > 0)
goto out;
} else {
- BUG_ON(ret == 0); /* Logical error */
+ /*
+ * Key with offset -1 found, there would have to exist a root
+ * with such id, but this is out of the valid range.
+ */
+ if (ret == 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
@@ -141,8 +148,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
if (ret > 0) {
btrfs_crit(fs_info,
"unable to find root key (%llu %u %llu) in tree %llu",
- key->objectid, key->type, key->offset,
- root->root_key.objectid);
+ key->objectid, key->type, key->offset, btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -322,8 +328,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
goto out;
-
- BUG_ON(ret != 0);
+ if (ret != 0) {
+ /* The root must exist but we did not find it by the key. */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = btrfs_del_item(trans, root, path);
out:
@@ -485,7 +494,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
}
/*
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * Reserve space for subvolume operation.
+ *
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need do reservation
@@ -508,7 +518,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ if (btrfs_qgroup_enabled(fs_info)) {
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index cce808b44cc0..8f5739e732b9 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -3,6 +3,18 @@
#ifndef BTRFS_ROOT_TREE_H
#define BTRFS_ROOT_TREE_H
+#include <linux/types.h>
+
+struct fscrypt_str;
+struct extent_buffer;
+struct btrfs_key;
+struct btrfs_root;
+struct btrfs_root_item;
+struct btrfs_path;
+struct btrfs_fs_info;
+struct btrfs_block_rsv;
+struct btrfs_trans_handle;
+
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
@@ -16,10 +28,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_root_item *item);
-int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_key *key,
- struct btrfs_root_item *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_root_item *item);
int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
struct btrfs_path *path, struct btrfs_root_item *root_item,
struct btrfs_key *root_key);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a2d91d9f8a10..3fcc7c092c5e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -16,7 +16,6 @@
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
-#include "check-integrity.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
@@ -24,6 +23,7 @@
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
+#include "raid-stripe-tree.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -43,7 +43,7 @@ struct scrub_ctx;
/*
* The following value only influences the performance.
*
- * This detemines how many stripes would be submitted in one go,
+ * This determines how many stripes would be submitted in one go,
* which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
*/
#define SCRUB_STRIPES_PER_GROUP 8
@@ -153,12 +153,14 @@ struct scrub_stripe {
unsigned int init_nr_io_errors;
unsigned int init_nr_csum_errors;
unsigned int init_nr_meta_errors;
+ unsigned int init_nr_meta_gen_errors;
/*
* The following error bitmaps are all for the current status.
* Every time we submit a new read, these bitmaps may be updated.
*
- * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
+ * error_bitmap = io_error_bitmap | csum_error_bitmap |
+ * meta_error_bitmap | meta_generation_bitmap;
*
* IO and csum errors can happen for both metadata and data.
*/
@@ -166,6 +168,7 @@ struct scrub_stripe {
unsigned long io_error_bitmap;
unsigned long csum_error_bitmap;
unsigned long meta_error_bitmap;
+ unsigned long meta_gen_error_bitmap;
/* For writeback (repair or replace) error reporting. */
unsigned long write_error_bitmap;
@@ -192,7 +195,6 @@ struct scrub_ctx {
int cur_stripe;
atomic_t cancel_req;
int readonly;
- int sectors_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -262,7 +264,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
if (ret < 0)
goto error;
@@ -617,7 +619,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
memcpy(on_disk_csum, header->csum, fs_info->csum_size);
if (logical != btrfs_stack_header_bytenr(header)) {
- bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
@@ -673,7 +675,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
}
if (stripe->sectors[sector_nr].generation !=
btrfs_stack_header_generation(header)) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree);
bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad generation, has %llu want %llu",
@@ -685,6 +687,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_clear(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree);
}
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
@@ -710,7 +713,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
/* Metadata, verify the full tree block. */
if (sector->is_metadata) {
/*
- * Check if the tree block crosses the stripe boudary. If
+ * Check if the tree block crosses the stripe boundary. If
* crossed the boundary, we cannot verify it but only give a
* warning.
*
@@ -839,7 +842,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
bbio->bio.bi_iter.bi_size >= blocksize)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
- btrfs_submit_bio(bbio, mirror);
+ btrfs_submit_bbio(bbio, mirror);
if (wait)
wait_scrub_stripe_io(stripe);
bbio = NULL;
@@ -858,7 +861,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
if (bbio) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
- btrfs_submit_bio(bbio, mirror);
+ btrfs_submit_bbio(bbio, mirror);
if (wait)
wait_scrub_stripe_io(stripe);
}
@@ -884,7 +887,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
/*
* Init needed infos for error reporting.
*
- * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
+ * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* thus no need for dev/physical, error reporting still needs dev and physical.
*/
if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
@@ -897,7 +900,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
ASSERT(stripe->mirror_num >= 1);
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
stripe->logical, &mapped_len, &bioc,
- NULL, NULL, 1);
+ NULL, NULL);
/*
* If we failed, dev will be NULL, and later detailed reports
* will just be skipped.
@@ -973,8 +976,22 @@ skip:
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("header error", dev, false,
stripe->logical, physical);
+ if (test_bit(sector_nr, &stripe->meta_gen_error_bitmap))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("generation error", dev, false,
+ stripe->logical, physical);
}
+ /* Update the device stats. */
+ for (int i = 0; i < stripe->init_nr_io_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
+ for (int i = 0; i < stripe->init_nr_csum_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ /* Generation mismatch error is based on each metadata, not each block. */
+ for (int i = 0; i < stripe->init_nr_meta_gen_errors;
+ i += (fs_info->nodesize >> fs_info->sectorsize_bits))
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);
+
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
@@ -983,7 +1000,8 @@ skip:
sctx->stat.no_csum += nr_nodatacsum_sectors;
sctx->stat.read_errors += stripe->init_nr_io_errors;
sctx->stat.csum_errors += stripe->init_nr_csum_errors;
- sctx->stat.verify_errors += stripe->init_nr_meta_errors;
+ sctx->stat.verify_errors += stripe->init_nr_meta_errors +
+ stripe->init_nr_meta_gen_errors;
sctx->stat.uncorrectable_errors +=
bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
sctx->stat.corrected_errors += nr_repaired_sectors;
@@ -1029,6 +1047,8 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
stripe->nr_sectors);
stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
stripe->nr_sectors);
+ stripe->init_nr_meta_gen_errors = bitmap_weight(&stripe->meta_gen_error_bitmap,
+ stripe->nr_sectors);
if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
goto out;
@@ -1143,6 +1163,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
bitmap_set(&stripe->write_error_bitmap, sector_nr,
bio_size >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&stripe->write_error_lock, flags);
+ for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
}
bio_put(&bbio->bio);
@@ -1290,7 +1313,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* return 0 if it is a data stripe, 1 means parity stripe.
*/
static int get_raid56_logic_offset(u64 physical, int num,
- struct map_lookup *map, u64 *offset,
+ struct btrfs_chunk_map *map, u64 *offset,
u64 *stripe_start)
{
int i;
@@ -1391,8 +1414,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ btrfs_release_path(path);
+ return -EUCLEAN;
+ }
- ASSERT(ret > 0);
/*
* Here we intentionally pass 0 as @min_objectid, as there could be
* an extent item starting before @search_start.
@@ -1419,14 +1449,11 @@ search_forward:
if (ret > 0)
break;
next:
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret) {
- /* Either no more item or fatal error */
- btrfs_release_path(path);
- return ret;
- }
+ ret = btrfs_next_item(extent_root, path);
+ if (ret) {
+ /* Either no more items or a fatal error. */
+ btrfs_release_path(path);
+ return ret;
}
}
btrfs_release_path(path);
@@ -1505,10 +1532,12 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
stripe->init_nr_io_errors = 0;
stripe->init_nr_csum_errors = 0;
stripe->init_nr_meta_errors = 0;
+ stripe->init_nr_meta_gen_errors = 0;
stripe->error_bitmap = 0;
stripe->io_error_bitmap = 0;
stripe->csum_error_bitmap = 0;
stripe->meta_error_bitmap = 0;
+ stripe->meta_gen_error_bitmap = 0;
}
/*
@@ -1538,6 +1567,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
u64 extent_gen;
int ret;
+ if (unlikely(!extent_root || !csum_root)) {
+ btrfs_err(fs_info, "no valid extent or csum root for scrub");
+ return -EUCLEAN;
+ }
memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
stripe->nr_sectors);
scrub_stripe_reset_bitmaps(stripe);
@@ -1645,20 +1678,105 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
}
}
+static u32 stripe_length(const struct scrub_stripe *stripe)
+{
+ ASSERT(stripe->bg);
+
+ return min(BTRFS_STRIPE_LEN,
+ stripe->bg->start + stripe->bg->length - stripe->logical);
+}
+
+static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
+ struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
+ u64 stripe_len = BTRFS_STRIPE_LEN;
+ int mirror = stripe->mirror_num;
+ int i;
+
+ atomic_inc(&stripe->pending_io);
+
+ for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+ struct page *page = scrub_stripe_get_page(stripe, i);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+
+ /* We're beyond the chunk boundary, no need to read anymore. */
+ if (i >= nr_sectors)
+ break;
+
+ /* The current sector cannot be merged, submit the bio. */
+ if (bbio &&
+ ((i > 0 &&
+ !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
+ bbio->bio.bi_iter.bi_size >= stripe_len)) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bbio(bbio, mirror);
+ bbio = NULL;
+ }
+
+ if (!bbio) {
+ struct btrfs_io_stripe io_stripe = {};
+ struct btrfs_io_context *bioc = NULL;
+ const u64 logical = stripe->logical +
+ (i << fs_info->sectorsize_bits);
+ int err;
+
+ io_stripe.rst_search_commit_root = true;
+ stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
+ /*
+ * For RST cases, we need to manually split the bbio to
+ * follow the RST boundary.
+ */
+ err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &stripe_len, &bioc, &io_stripe, &mirror);
+ btrfs_put_bioc(bioc);
+ if (err < 0) {
+ set_bit(i, &stripe->io_error_bitmap);
+ set_bit(i, &stripe->error_bitmap);
+ continue;
+ }
+
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+ }
+
+ __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ }
+
+ if (bbio) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bbio(bbio, mirror);
+ }
+
+ if (atomic_dec_and_test(&stripe->pending_io)) {
+ wake_up(&stripe->io_wait);
+ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+ queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
+ }
+}
+
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
struct scrub_stripe *stripe)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
- unsigned int nr_sectors = min_t(u64, BTRFS_STRIPE_LEN, stripe->bg->start +
- stripe->bg->length - stripe->logical) >>
- fs_info->sectorsize_bits;
+ unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
ASSERT(stripe->bg);
ASSERT(stripe->mirror_num > 0);
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+ if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
+ scrub_submit_extent_sector_read(sctx, stripe);
+ return;
+ }
+
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
@@ -1688,7 +1806,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
mirror = calc_next_mirror(mirror, num_copies);
}
- btrfs_submit_bio(bbio, mirror);
+ btrfs_submit_bbio(bbio, mirror);
}
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
@@ -1761,7 +1879,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
if (sctx->is_dev_replace) {
/*
* For dev-replace, if we know there is something wrong with
- * metadata, we should immedately abort.
+ * metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
if (stripe_has_metadata_error(&sctx->stripes[i])) {
@@ -1787,6 +1905,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
stripe = &sctx->stripes[i];
wait_scrub_stripe_io(stripe);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.last_physical = stripe->physical + stripe_length(stripe);
+ spin_unlock(&sctx->stat_lock);
scrub_reset_stripe(stripe);
}
out:
@@ -1843,7 +1964,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 full_stripe_start)
{
DECLARE_COMPLETION_ONSTACK(io_done);
@@ -1969,7 +2090,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
- &length, &bioc, NULL, NULL, 1);
+ &length, &bioc, NULL, NULL);
if (ret < 0) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
@@ -2012,7 +2133,7 @@ out:
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length,
struct btrfs_device *device,
u64 physical, int mirror_num)
@@ -2055,7 +2176,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
cur_physical, &found_logical);
if (ret > 0) {
/* No more extent, just update the accounting */
+ spin_lock(&sctx->stat_lock);
sctx->stat.last_physical = physical + logical_length;
+ spin_unlock(&sctx->stat_lock);
ret = 0;
break;
}
@@ -2073,7 +2196,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
/* Calculate the full stripe length for simple stripe based profiles */
-static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2082,7 +2205,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
}
/* Get the logical bytenr for the stripe */
-static u64 simple_stripe_get_logical(struct map_lookup *map,
+static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
struct btrfs_block_group *bg,
int stripe_index)
{
@@ -2099,7 +2222,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
}
/* Get the mirror number for the stripe */
-static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2111,7 +2234,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
static int scrub_simple_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct btrfs_device *device,
int stripe_index)
{
@@ -2144,18 +2267,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct extent_map *em,
+ struct btrfs_chunk_map *map,
struct btrfs_device *scrub_dev,
int stripe_index)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
int ret2;
u64 physical = map->stripes[stripe_index].physical;
- const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
@@ -2253,6 +2375,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
stripe_logical += chunk_logical;
ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
map, stripe_logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
+ physical_end);
+ spin_unlock(&sctx->stat_lock);
if (ret)
goto out;
goto next;
@@ -2318,17 +2444,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
u64 dev_extent_len)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
int i;
int ret = 0;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, bg->start, bg->length);
- read_unlock(&map_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
+ if (!map) {
/*
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
@@ -2340,22 +2461,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
return ret;
}
- if (em->start != bg->start)
+ if (map->start != bg->start)
goto out;
- if (em->len < dev_extent_len)
+ if (map->chunk_len < dev_extent_len)
goto out;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
+ ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
if (ret)
goto out;
}
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2364,19 +2484,15 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root,
struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_trans_handle *trans;
if (!btrfs_is_zoned(fs_info))
return 0;
btrfs_wait_block_group_reservations(cache);
btrfs_wait_nocow_writers(cache);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(root);
}
static noinline_for_stack
@@ -2607,8 +2723,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
if (sctx->is_dev_replace) {
btrfs_wait_nocow_writers(cache);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
- cache->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
}
scrub_pause_off(fs_info);
@@ -2736,7 +2851,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
else
- gen = fs_info->last_trans_committed;
+ gen = btrfs_get_last_trans_committed(fs_info);
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr);
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
index 7639103ebf9d..f0df597b75c7 100644
--- a/fs/btrfs/scrub.h
+++ b/fs/btrfs/scrub.h
@@ -3,6 +3,12 @@
#ifndef BTRFS_SCRUB_H
#define BTRFS_SCRUB_H
+#include <linux/types.h>
+
+struct btrfs_fs_info;
+struct btrfs_device;
+struct btrfs_scrub_progress;
+
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index aa1e6d88a72c..c843b4aefb8a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -25,7 +25,6 @@
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
-#include "xattr.h"
#include "print-tree.h"
#include "accessors.h"
#include "dir-item.h"
@@ -63,7 +62,7 @@ struct fs_path {
/*
* Average path length does not exceed 200 bytes, we'll have
* better packing in the slab and higher chance to satisfy
- * a allocation later during send.
+ * an allocation later during send.
*/
char pad[256];
};
@@ -347,8 +346,10 @@ struct name_cache_entry {
u64 parent_gen;
int ret;
int need_later_update;
+ /* Name length without NUL terminator. */
int name_len;
- char name[];
+ /* Not NUL terminated. */
+ char name[] __counted_by(name_len) __nonstring;
};
/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
@@ -393,9 +394,8 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
btrfs_err(sctx->send_root->fs_info,
"Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
result_string, what, sctx->cmp_key->objectid,
- sctx->send_root->root_key.objectid,
- (sctx->parent_root ?
- sctx->parent_root->root_key.objectid : 0));
+ btrfs_root_id(sctx->send_root),
+ (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0));
}
__maybe_unused
@@ -487,10 +487,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
- if (len > PATH_MAX) {
- WARN_ON(1);
- return -ENOMEM;
- }
+ if (WARN_ON(len > PATH_MAX))
+ return -ENAMETOOLONG;
path_len = p->end - p->start;
old_buf_len = p->buf_len;
@@ -801,7 +799,7 @@ static int send_cmd(struct send_ctx *sctx)
put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
put_unaligned_le32(0, &hdr->crc);
- crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -1138,7 +1136,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
/*
* Start with a small buffer (1 page). If later we end up needing more
* space, which can happen for xattrs on a fs with a leaf size greater
- * then the page size, attempt to increase the buffer. Typically xattr
+ * than the page size, attempt to increase the buffer. Typically xattr
* values are small.
*/
buf_len = PATH_MAX;
@@ -1317,9 +1315,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
u64 root = (u64)(uintptr_t)key;
const struct clone_root *cr = elt;
- if (root < cr->root->root_key.objectid)
+ if (root < btrfs_root_id(cr->root))
return -1;
- if (root > cr->root->root_key.objectid)
+ if (root > btrfs_root_id(cr->root))
return 1;
return 0;
}
@@ -1329,9 +1327,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
const struct clone_root *cr1 = e1;
const struct clone_root *cr2 = e2;
- if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
+ if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root))
return -1;
- if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
+ if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root))
return 1;
return 0;
}
@@ -1419,7 +1417,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
- if (btrfs_lru_cache_size(&sctx->backref_cache) == 0)
+ if (sctx->backref_cache.size == 0)
return false;
/*
@@ -1517,7 +1515,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
- if (btrfs_lru_cache_size(&sctx->backref_cache) == 1)
+ if (sctx->backref_cache.size == 1)
sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
}
@@ -1779,7 +1777,7 @@ static int read_symlink(struct btrfs_root *root,
*/
btrfs_err(root->fs_info,
"Found empty symlink inode %llu at root %llu",
- ino, root->root_key.objectid);
+ ino, btrfs_root_id(root));
ret = -EIO;
goto out;
}
@@ -2390,7 +2388,7 @@ out_cache:
/*
* Store the result of the lookup in the name cache.
*/
- nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
+ nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL);
if (!nce) {
ret = -ENOMEM;
goto out;
@@ -2402,7 +2400,7 @@ out_cache:
nce->parent_gen = *parent_gen;
nce->name_len = fs_path_len(dest);
nce->ret = ret;
- strcpy(nce->name, dest->start);
+ memcpy(nce->name, dest->start, nce->name_len);
if (ino < sctx->send_progress)
nce->need_later_update = 0;
@@ -2533,7 +2531,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
}
- key.objectid = send_root->root_key.objectid;
+ key.objectid = btrfs_root_id(send_root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
@@ -2549,7 +2547,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
- key.objectid != send_root->root_key.objectid) {
+ key.objectid != btrfs_root_id(send_root)) {
ret = -ENOENT;
goto out;
}
@@ -2822,8 +2820,7 @@ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
static int trim_dir_utimes_cache(struct send_ctx *sctx)
{
- while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) >
- SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
+ while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
struct btrfs_lru_cache_entry *lru;
int ret;
@@ -5191,11 +5188,10 @@ out:
static int process_verity(struct send_ctx *sctx)
{
int ret = 0;
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
struct inode *inode;
struct fs_path *p;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+ inode = btrfs_iget(sctx->cur_ino, sctx->send_root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5276,10 +5272,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct page *page;
+ struct folio *folio;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
+ struct address_space *mapping = sctx->cur_inode->i_mapping;
int ret;
ret = put_data_header(sctx, len);
@@ -5292,44 +5289,50 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
- page = find_lock_page(sctx->cur_inode->i_mapping, index);
- if (!page) {
- page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+again:
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ page_cache_sync_readahead(mapping,
&sctx->ra, NULL, index,
last_index + 1 - index);
- page = find_or_create_page(sctx->cur_inode->i_mapping,
- index, GFP_KERNEL);
- if (!page) {
- ret = -ENOMEM;
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
break;
}
}
- if (PageReadahead(page))
- page_cache_async_readahead(sctx->cur_inode->i_mapping,
- &sctx->ra, NULL, page_folio(page),
- index, last_index + 1 - index);
+ WARN_ON(folio_order(folio));
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
+ if (folio_test_readahead(folio))
+ page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
+ last_index + 1 - index);
+
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
btrfs_err(fs_info,
"send: IO error at offset %llu for inode %llu root %llu",
- page_offset(page), sctx->cur_ino,
- sctx->send_root->root_key.objectid);
- put_page(page);
+ folio_pos(folio), sctx->cur_ino,
+ btrfs_root_id(sctx->send_root));
+ folio_put(folio);
ret = -EIO;
break;
}
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
}
- memcpy_from_page(sctx->send_buf + sctx->send_size, page,
- pg_offset, cur_len);
- unlock_page(page);
- put_page(page);
+ memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
+ pg_offset, cur_len);
+ folio_unlock(folio);
+ folio_put(folio);
index++;
pg_offset = 0;
len -= cur_len;
@@ -5390,7 +5393,7 @@ static int send_clone(struct send_ctx *sctx,
btrfs_debug(sctx->send_root->fs_info,
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, clone_root->root->root_key.objectid,
+ offset, len, btrfs_root_id(clone_root->root),
clone_root->ino, clone_root->offset);
p = fs_path_alloc();
@@ -5552,7 +5555,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
size_t inline_size;
int ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5619,7 +5622,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
u32 crc;
int ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5678,7 +5681,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
*/
- ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
(data_offset >> PAGE_SHIFT));
@@ -5688,8 +5691,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
hdr->crc = 0;
- crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
- crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ crc = crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -5748,7 +5751,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
if (sctx->cur_inode == NULL) {
struct btrfs_root *root = sctx->send_root;
- sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+ sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(sctx->cur_inode)) {
int err = PTR_ERR(sctx->cur_inode);
@@ -6524,21 +6527,18 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
return 0;
- if (sctx->cur_inode_last_extent == (u64)-1) {
- ret = get_last_extent(sctx, key->offset - 1);
- if (ret)
- return ret;
- }
-
- if (path->slots[0] == 0 &&
- sctx->cur_inode_last_extent < key->offset) {
- /*
- * We might have skipped entire leafs that contained only
- * file extent items for our current inode. These leafs have
- * a generation number smaller (older) than the one in the
- * current leaf and the leaf our last extent came from, and
- * are located between these 2 leafs.
- */
+ /*
+ * Get last extent's end offset (exclusive) if we haven't determined it
+ * yet (we're processing the first file extent item that is new), or if
+ * we're at the first slot of a leaf and the last extent's end is less
+ * than the current extent's offset, because we might have skipped
+ * entire leaves that contained only file extent items for our current
+ * inode. These leaves have a generation number smaller (older) than the
+ * one in the current leaf and the leaf our last extent came from, and
+ * are located between these 2 leaves.
+ */
+ if ((sctx->cur_inode_last_extent == (u64)-1) ||
+ (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) {
ret = get_last_extent(sctx, key->offset - 1);
if (ret)
return ret;
@@ -7194,13 +7194,11 @@ static int changed_extent(struct send_ctx *sctx,
static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
{
- int ret = 0;
-
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result == BTRFS_COMPARE_TREE_NEW)
sctx->cur_inode_needs_verity = true;
}
- return ret;
+ return 0;
}
static int dir_changed(struct send_ctx *sctx, u64 dir)
@@ -7389,7 +7387,7 @@ static int search_key_again(const struct send_ctx *sctx,
"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
key->objectid, key->type, key->offset,
(root == sctx->parent_root ? "parent" : "send"),
- root->root_key.objectid, path->lowest_level,
+ btrfs_root_id(root), path->lowest_level,
path->slots[path->lowest_level]);
return -EUCLEAN;
}
@@ -8050,34 +8048,18 @@ out:
*/
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
- int i;
- struct btrfs_trans_handle *trans = NULL;
-
-again:
- if (sctx->parent_root &&
- sctx->parent_root->node != sctx->parent_root->commit_root)
- goto commit_trans;
-
- for (i = 0; i < sctx->clone_roots_cnt; i++)
- if (sctx->clone_roots[i].root->node !=
- sctx->clone_roots[i].root->commit_root)
- goto commit_trans;
+ struct btrfs_root *root = sctx->parent_root;
- if (trans)
- return btrfs_end_transaction(trans);
+ if (root && root->node != root->commit_root)
+ return btrfs_commit_current_transaction(root);
- return 0;
-
-commit_trans:
- /* Use any root, all fs roots will get their commit roots updated. */
- if (!trans) {
- trans = btrfs_join_transaction(sctx->send_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- goto again;
+ for (int i = 0; i < sctx->clone_roots_cnt; i++) {
+ root = sctx->clone_roots[i].root;
+ if (root->node != root->commit_root)
+ return btrfs_commit_current_transaction(root);
}
- return btrfs_commit_transaction(trans);
+ return 0;
}
/*
@@ -8098,7 +8080,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
}
for (i = 0; i < sctx->clone_roots_cnt; i++) {
@@ -8106,7 +8088,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
}
return 0;
@@ -8123,7 +8105,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
if (root->send_in_progress < 0)
btrfs_err(root->fs_info,
"send_in_progress unbalanced %d root %llu",
- root->send_in_progress, root->root_key.objectid);
+ root->send_in_progress, btrfs_root_id(root));
spin_unlock(&root->root_item_lock);
}
@@ -8131,13 +8113,13 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
btrfs_warn_rl(root->fs_info,
"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
- root->root_key.objectid, root->dedupe_in_progress);
+ btrfs_root_id(root), root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = BTRFS_I(inode)->root;
+ struct btrfs_root *send_root = inode->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 4f5509cb1803..b07f4aa66878 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -8,6 +8,11 @@
#define BTRFS_SEND_H
#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/align.h>
+
+struct btrfs_inode;
+struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
/* Conditional support for the upcoming protocol version. */
@@ -25,9 +30,6 @@
#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE)
-struct inode;
-struct btrfs_ioctl_send_args;
-
enum btrfs_tlv_type {
BTRFS_TLV_U8,
BTRFS_TLV_U16,
@@ -180,6 +182,6 @@ enum {
__BTRFS_SEND_A_MAX = 35,
};
-long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg);
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 581bdd709ee0..d5a9cd8a4fd8 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include "linux/spinlock.h"
+#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
#include "space-info.h"
@@ -9,7 +11,6 @@
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
-#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -162,7 +163,7 @@
* thing with or without extra unallocated space.
*/
-u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
bool may_use_included)
{
ASSERT(s_info);
@@ -191,6 +192,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
*/
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL)
+
/*
* Calculate chunk size depending on volume type (regular or zoned).
*/
@@ -233,6 +236,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (!space_info)
return -ENOMEM;
+ space_info->fs_info = info;
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
@@ -341,12 +345,35 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
return NULL;
}
+static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo;
+ u64 data_chunk_size;
+
+ /*
+ * Calculate the data_chunk_size, space_info->chunk_size is the
+ * "optimal" chunk size based on the fs size. However when we actually
+ * allocate the chunk we will strip this down further, making it no
+ * more than 10% of the disk or 1G, whichever is smaller.
+ *
+ * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size)
+ * as it is.
+ */
+ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ if (btrfs_is_zoned(fs_info))
+ return data_sinfo->chunk_size;
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ return min_t(u64, data_chunk_size, SZ_1G);
+}
+
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+ const struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
+ u64 data_chunk_size;
int factor;
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
@@ -364,6 +391,27 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
*/
factor = btrfs_bg_type_to_factor(profile);
avail = div_u64(avail, factor);
+ if (avail == 0)
+ return 0;
+
+ data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ /*
+ * Since data allocations immediately use block groups as part of the
+ * reservation, because we assume that data reservations will == actual
+ * usage, we could potentially overcommit and then immediately have that
+ * available space used by a data allocation, which could put us in a
+ * bind when we get close to filling the file system.
+ *
+ * To handle this simply remove the data_chunk_size from the available
+ * space. If we are relatively empty this won't affect our ability to
+ * overcommit much, and if we're very close to full it'll keep us from
+ * getting into a position where we've given ourselves very little
+ * metadata wiggle room.
+ */
+ if (avail <= data_chunk_size)
+ return 0;
+ avail -= data_chunk_size;
/*
* If we aren't flushing all things, let us overcommit up to
@@ -374,11 +422,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+
+ /*
+ * On the zoned mode, we always allocate one zone as one chunk.
+ * Returning non-zone size alingned bytes here will result in
+ * less pressure for the async metadata reclaim process, and it
+ * will over-commit too much leading to ENOSPC. Align down to the
+ * zone size to avoid that.
+ */
+ if (btrfs_is_zoned(fs_info))
+ avail = ALIGN_DOWN(avail, fs_info->zone_size);
+
return avail;
}
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
+ const struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
u64 avail;
@@ -483,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
-static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info)
+static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *info)
{
const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
@@ -555,20 +614,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
return nr;
}
-static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
- u64 to_reclaim)
-{
- const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
- u64 nr;
-
- nr = div64_u64(to_reclaim, bytes);
- if (!nr)
- nr = 1;
- return nr;
-}
-
-#define EXTENT_SIZE_PER_ITEM SZ_256K
-
/*
* shrink metadata reservation for delalloc
*/
@@ -668,7 +713,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
skip_async:
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, items, NULL);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -748,10 +793,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
- nr = calc_delayed_refs_nr(fs_info, num_bytes);
+ btrfs_run_delayed_refs(trans, num_bytes);
else
- nr = 0;
- btrfs_run_delayed_refs(trans, nr);
+ btrfs_run_delayed_refs(trans, 0);
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
@@ -788,14 +832,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
* because that does not wait for a transaction to fully commit
* (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
*/
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- if (ret == -ENOENT)
- ret = 0;
- break;
- }
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(root);
break;
default:
ret = -ENOSPC;
@@ -807,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
return;
}
-static inline u64
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *space_info)
{
u64 used;
u64 avail;
@@ -834,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
}
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+ const struct btrfs_space_info *space_info)
{
const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
@@ -977,7 +1013,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
}
/*
- * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * We've exhausted our flushing, start failing tickets.
+ *
* @fs_info - fs_info for this fs
* @space_info - the space info we were flushing
*
@@ -1741,7 +1778,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* Try to reserve metadata bytes from the block_rsv's space.
*
* @fs_info: the filesystem
- * @block_rsv: block_rsv we're allocating for
+ * @space_info: the space_info we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
*
@@ -1753,21 +1790,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* space already.
*/
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
- ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- block_rsv->space_info->flags,
- orig_bytes, 1);
+ space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, block_rsv->space_info,
- orig_bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
}
return ret;
}
@@ -1850,3 +1885,202 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
+
+static u64 calc_pct_ratio(u64 x, u64 y)
+{
+ int err;
+
+ if (!y)
+ return 0;
+again:
+ err = check_mul_overflow(100, x, &x);
+ if (err)
+ goto lose_precision;
+ return div64_u64(x, y);
+lose_precision:
+ x >>= 10;
+ y >>= 10;
+ if (!y)
+ y = 1;
+ goto again;
+}
+
+/*
+ * A reasonable buffer for unallocated space is 10 data block_groups.
+ * If we claw this back repeatedly, we can still achieve efficient
+ * utilization when near full, and not do too much reclaim while
+ * always maintaining a solid buffer for workloads that quickly
+ * allocate and pressure the unallocated space.
+ */
+static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
+{
+ u64 chunk_sz = calc_effective_data_chunk_size(fs_info);
+
+ return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz;
+}
+
+/*
+ * The fundamental goal of automatic reclaim is to protect the filesystem's
+ * unallocated space and thus minimize the probability of the filesystem going
+ * read only when a metadata allocation failure causes a transaction abort.
+ *
+ * However, relocations happen into the space_info's unused space, therefore
+ * automatic reclaim must also back off as that space runs low. There is no
+ * value in doing trivial "relocations" of re-writing the same block group
+ * into a fresh one.
+ *
+ * Furthermore, we want to avoid doing too much reclaim even if there are good
+ * candidates. This is because the allocator is pretty good at filling up the
+ * holes with writes. So we want to do just enough reclaim to try and stay
+ * safe from running out of unallocated space but not be wasteful about it.
+ *
+ * Therefore, the dynamic reclaim threshold is calculated as follows:
+ * - calculate a target unallocated amount of 5 block group sized chunks
+ * - ratchet up the intensity of reclaim depending on how far we are from
+ * that target by using a formula of unalloc / target to set the threshold.
+ *
+ * Typically with 10 block groups as the target, the discrete values this comes
+ * out to are 0, 10, 20, ... , 80, 90, and 99.
+ */
+static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+ u64 target = calc_unalloc_target(fs_info);
+ u64 alloc = space_info->total_bytes;
+ u64 used = btrfs_space_info_used(space_info, false);
+ u64 unused = alloc - used;
+ u64 want = target > unalloc ? target - unalloc : 0;
+ u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ /* If we have no unused space, don't bother, it won't work anyway. */
+ if (unused < data_chunk_size)
+ return 0;
+
+ /* Cast to int is OK because want <= target. */
+ return calc_pct_ratio(want, target);
+}
+
+int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info)
+{
+ lockdep_assert_held(&space_info->lock);
+
+ if (READ_ONCE(space_info->dynamic_reclaim))
+ return calc_dynamic_reclaim_threshold(space_info);
+ return READ_ONCE(space_info->bg_reclaim_threshold);
+}
+
+/*
+ * Under "urgent" reclaim, we will reclaim even fresh block groups that have
+ * recently seen successful allocations, as we are desperate to reclaim
+ * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs.
+ */
+static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+ u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ return unalloc < data_chunk_size;
+}
+
+static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, int raid)
+{
+ struct btrfs_block_group *bg;
+ int thresh_pct;
+ bool try_again = true;
+ bool urgent;
+
+ spin_lock(&space_info->lock);
+ urgent = is_reclaim_urgent(space_info);
+ thresh_pct = btrfs_calc_reclaim_threshold(space_info);
+ spin_unlock(&space_info->lock);
+
+ down_read(&space_info->groups_sem);
+again:
+ list_for_each_entry(bg, &space_info->block_groups[raid], list) {
+ u64 thresh;
+ bool reclaim = false;
+
+ btrfs_get_block_group(bg);
+ spin_lock(&bg->lock);
+ thresh = mult_perc(bg->length, thresh_pct);
+ if (bg->used < thresh && bg->reclaim_mark) {
+ try_again = false;
+ reclaim = true;
+ }
+ bg->reclaim_mark++;
+ spin_unlock(&bg->lock);
+ if (reclaim)
+ btrfs_mark_bg_to_reclaim(bg);
+ btrfs_put_block_group(bg);
+ }
+
+ /*
+ * In situations where we are very motivated to reclaim (low unalloc)
+ * use two passes to make the reclaim mark check best effort.
+ *
+ * If we have any staler groups, we don't touch the fresher ones, but if we
+ * really need a block group, do take a fresh one.
+ */
+ if (try_again && urgent) {
+ try_again = false;
+ goto again;
+ }
+
+ up_read(&space_info->groups_sem);
+}
+
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
+{
+ u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info);
+
+ lockdep_assert_held(&space_info->lock);
+ space_info->reclaimable_bytes += bytes;
+
+ if (space_info->reclaimable_bytes >= chunk_sz)
+ btrfs_set_periodic_reclaim_ready(space_info, true);
+}
+
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
+{
+ lockdep_assert_held(&space_info->lock);
+ if (!READ_ONCE(space_info->periodic_reclaim))
+ return;
+ if (ready != space_info->periodic_reclaim_ready) {
+ space_info->periodic_reclaim_ready = ready;
+ if (!ready)
+ space_info->reclaimable_bytes = 0;
+ }
+}
+
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+{
+ bool ret;
+
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return false;
+ if (!READ_ONCE(space_info->periodic_reclaim))
+ return false;
+
+ spin_lock(&space_info->lock);
+ ret = space_info->periodic_reclaim_ready;
+ btrfs_set_periodic_reclaim_ready(space_info, false);
+ spin_unlock(&space_info->lock);
+
+ return ret;
+}
+
+void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
+{
+ int raid;
+ struct btrfs_space_info *space_info;
+
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ if (!btrfs_should_periodic_reclaim(space_info))
+ continue;
+ for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
+ do_reclaim_sweep(fs_info, space_info, raid);
+ }
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 08a3bd10addc..efbecc0c5258 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,8 +3,18 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
+#include <trace/events/btrfs.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/kobject.h>
+#include <linux/lockdep.h>
+#include <linux/wait.h>
+#include <linux/rwsem.h>
#include "volumes.h"
+struct btrfs_fs_info;
+struct btrfs_block_group;
+
/*
* Different levels for to flush space when doing space reservations.
*
@@ -84,6 +94,7 @@ enum btrfs_flush_state {
};
struct btrfs_space_info {
+ struct btrfs_fs_info *fs_info;
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
@@ -155,6 +166,47 @@ struct btrfs_space_info {
struct kobject kobj;
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
+
+ /*
+ * Monotonically increasing counter of block group reclaim attempts
+ * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_count
+ */
+ u64 reclaim_count;
+
+ /*
+ * Monotonically increasing counter of reclaimed bytes
+ * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_bytes
+ */
+ u64 reclaim_bytes;
+
+ /*
+ * Monotonically increasing counter of reclaim errors
+ * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_errors
+ */
+ u64 reclaim_errors;
+
+ /*
+ * If true, use the dynamic relocation threshold, instead of the
+ * fixed bg_reclaim_threshold.
+ */
+ bool dynamic_reclaim;
+
+ /*
+ * Periodically check all block groups against the reclaim
+ * threshold in the cleaner thread.
+ */
+ bool periodic_reclaim;
+
+ /*
+ * Periodic reclaim should be a no-op if a space_info hasn't
+ * freed any space since the last time we tried.
+ */
+ bool periodic_reclaim_ready;
+
+ /*
+ * Net bytes freed or allocated since the last reclaim pass.
+ */
+ s64 reclaimable_bytes;
};
struct reserve_ticket {
@@ -165,7 +217,7 @@ struct reserve_ticket {
wait_queue_head_t wait;
};
-static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
@@ -206,20 +258,20 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
-u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
+ const struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
@@ -238,4 +290,10 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
+int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
+void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index b98d42ca5564..88a01d51ab11 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -64,7 +64,8 @@
* This means a slightly higher tree locking latency.
*/
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+#if PAGE_SIZE > SZ_4K
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
{
if (fs_info->sectorsize >= PAGE_SIZE)
return false;
@@ -74,8 +75,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
* mapping. And if page->mapping->host is data inode, it's subpage.
* As we have ruled our sectorsize >= PAGE_SIZE case already.
*/
- if (!page->mapping || !page->mapping->host ||
- is_data_inode(page->mapping->host))
+ if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host)))
return true;
/*
@@ -86,37 +86,10 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
return true;
return false;
}
-
-void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
-{
- unsigned int cur = 0;
- unsigned int nr_bits;
-
- ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
-
- nr_bits = PAGE_SIZE / sectorsize;
- subpage_info->bitmap_nr_bits = nr_bits;
-
- subpage_info->uptodate_offset = cur;
- cur += nr_bits;
-
- subpage_info->dirty_offset = cur;
- cur += nr_bits;
-
- subpage_info->writeback_offset = cur;
- cur += nr_bits;
-
- subpage_info->ordered_offset = cur;
- cur += nr_bits;
-
- subpage_info->checked_offset = cur;
- cur += nr_bits;
-
- subpage_info->total_nr_bits = cur;
-}
+#endif
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type)
+ struct folio *folio, enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
@@ -124,31 +97,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
* We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
*/
- if (page->mapping)
- ASSERT(PageLocked(page));
+ if (folio->mapping)
+ ASSERT(folio_test_locked(folio));
- /* Either not subpage, or the page already has private attached */
- if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
+ /* Either not subpage, or the folio already has private attached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
if (IS_ERR(subpage))
return PTR_ERR(subpage);
- attach_page_private(page, subpage);
+ folio_attach_private(folio, subpage);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- /* Either not subpage, or already detached */
- if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
+ /* Either not subpage, or the folio already has private attached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
return;
- subpage = detach_page_private(page);
+ subpage = folio_detach_private(folio);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@@ -162,18 +134,16 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(fs_info->sectorsize < PAGE_SIZE);
real_size = struct_size(ret, bitmaps,
- BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page));
ret = kzalloc(real_size, GFP_NOFS);
if (!ret)
return ERR_PTR(-ENOMEM);
spin_lock_init(&ret->lock);
- if (type == BTRFS_SUBPAGE_METADATA) {
+ if (type == BTRFS_SUBPAGE_METADATA)
atomic_set(&ret->eb_refs, 0);
- } else {
- atomic_set(&ret->readers, 0);
- atomic_set(&ret->writers, 0);
- }
+ else
+ atomic_set(&ret->nr_locked, 0);
return ret;
}
@@ -188,243 +158,251 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
* This is important for eb allocation, to prevent race with last eb freeing
* of the same page.
* With the eb_refs increased before the eb inserted into radix tree,
- * detach_extent_buffer_page() won't detach the page private while we're still
+ * detach_extent_buffer_page() won't detach the folio private while we're still
* allocating the extent buffer.
*/
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
atomic_inc(&subpage->eb_refs);
}
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(atomic_read(&subpage->eb_refs));
atomic_dec(&subpage->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
+ /* For subpage support, the folio must be single page. */
+ ASSERT(folio_order(folio) == 0);
+
/* Basic checks */
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
/*
* The range check only works for mapped page, we can still have
* unmapped page like dummy extent buffer pages.
*/
- if (page->mapping)
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
+ if (folio->mapping)
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_pos(folio) + PAGE_SIZE);
}
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const int nbits = len >> fs_info->sectorsize_bits;
-
- btrfs_subpage_assert(fs_info, page, start, len);
-
- atomic_add(nbits, &subpage->readers);
-}
-
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const int nbits = len >> fs_info->sectorsize_bits;
- bool is_data;
- bool last;
-
- btrfs_subpage_assert(fs_info, page, start, len);
- is_data = is_data_inode(page->mapping->host);
- ASSERT(atomic_read(&subpage->readers) >= nbits);
- last = atomic_sub_and_test(nbits, &subpage->readers);
-
- /*
- * For data we need to unlock the page if the last read has finished.
- *
- * And please don't replace @last with atomic_sub_and_test() call
- * inside if () condition.
- * As we want the atomic_sub_and_test() to be always executed.
- */
- if (is_data && last)
- unlock_page(page);
-}
+#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
+({ \
+ unsigned int __start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, folio, start, len); \
+ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \
+ __start_bit; \
+})
-static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
u64 orig_start = *start;
u32 orig_len = *len;
- *start = max_t(u64, page_offset(page), orig_start);
+ *start = max_t(u64, folio_pos(folio), orig_start);
/*
* For certain call sites like btrfs_drop_pages(), we may have pages
* beyond the target range. In that case, just set @len to 0, subpage
* helpers can handle @len == 0 without any problem.
*/
- if (page_offset(page) >= orig_start + orig_len)
+ if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
orig_start + orig_len) - *start;
}
-void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const int nbits = (len >> fs_info->sectorsize_bits);
- int ret;
-
- btrfs_subpage_assert(fs_info, page, start, len);
-
- ASSERT(atomic_read(&subpage->readers) == 0);
- ret = atomic_add_return(nbits, &subpage->writers);
- ASSERT(ret == nbits);
-}
-
-bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
+ unsigned long flags;
+ unsigned int cleared = 0;
+ int bit = start_bit;
+ bool last;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
+ spin_lock_irqsave(&subpage->lock, flags);
/*
* We have call sites passing @lock_page into
* extent_clear_unlock_delalloc() for compression path.
*
* This @locked_page is locked by plain lock_page(), thus its
- * subpage::writers is 0. Handle them in a special way.
+ * subpage::locked is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->writers) == 0)
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
return true;
+ }
- ASSERT(atomic_read(&subpage->writers) >= nbits);
- return atomic_sub_and_test(nbits, &subpage->writers);
+ for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) {
+ clear_bit(bit, subpage->bitmaps);
+ cleared++;
+ }
+ ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->nr_locked);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return last;
}
/*
- * Lock a page for delalloc page writeback.
+ * Handle different locked folios:
+ *
+ * - Non-subpage folio
+ * Just unlock it.
*
- * Return -EAGAIN if the page is not properly initialized.
- * Return 0 with the page locked, and writer counter updated.
+ * - folio locked but without any subpage locked
+ * This happens either before writepage_delalloc() or the delalloc range is
+ * already handled by previous folio.
+ * We can simple unlock it.
*
- * Even with 0 returned, the page still need extra check to make sure
- * it's really the correct page, as the caller is using
- * filemap_get_folios_contig(), which can race with page invalidating.
+ * - folio locked with subpage range locked.
+ * We go through the locked sectors inside the range and clear their locked
+ * bitmap, reduce the writer lock number, and unlock the page if that's
+ * the last locked range.
*/
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
- lock_page(page);
- return 0;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+
+ ASSERT(folio_test_locked(folio));
+
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
}
- lock_page(page);
- if (!PagePrivate(page) || !page->private) {
- unlock_page(page);
- return -EAGAIN;
+
+ /*
+ * For subpage case, there are two types of locked page. With or
+ * without locked number.
+ *
+ * Since we own the page lock, no one else could touch subpage::locked
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ /* No subpage lock, locked by plain lock_page(). */
+ folio_unlock(folio);
+ return;
}
- btrfs_subpage_clamp_range(page, &start, &len);
- btrfs_subpage_start_writer(fs_info, page, start, len);
- return 0;
+
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len))
+ folio_unlock(folio);
}
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
- btrfs_subpage_clamp_range(page, &start, &len);
- if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
- unlock_page(page);
-}
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
+ unsigned long flags;
+ bool last = false;
+ int cleared = 0;
+ int bit;
-#define subpage_calc_start_bit(fs_info, page, name, start, len) \
-({ \
- unsigned int start_bit; \
- \
- btrfs_subpage_assert(fs_info, page, start, len); \
- start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- start_bit += fs_info->subpage_info->name##_offset; \
- start_bit; \
-})
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
+
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ /* No subpage lock, locked by plain lock_page(). */
+ folio_unlock(folio);
+ return;
+ }
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) {
+ if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
+ cleared++;
+ }
+ ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->nr_locked);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ if (last)
+ folio_unlock(folio);
+}
#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
bitmap_test_range_all_set(subpage->bitmaps, \
- fs_info->subpage_info->name##_offset, \
- fs_info->subpage_info->bitmap_nr_bits)
+ fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
+ fs_info->sectors_per_page)
#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
bitmap_test_range_all_zero(subpage->bitmaps, \
- fs_info->subpage_info->name##_offset, \
- fs_info->subpage_info->bitmap_nr_bits)
+ fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
+ fs_info->sectors_per_page)
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
}
/*
@@ -438,10 +416,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
* extra handling for tree blocks.
*/
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
@@ -455,101 +433,102 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
}
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
bool last;
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len);
if (last)
- clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
}
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- set_page_writeback(page);
+ if (!folio_test_writeback(folio))
+ folio_start_writeback(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
- ASSERT(PageWriteback(page));
- end_page_writeback(page);
+ ASSERT(folio_test_writeback(folio));
+ folio_end_writeback(folio);
}
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- SetPageOrdered(page);
+ folio_set_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
- ClearPageOrdered(page);
+ folio_clear_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
- SetPageChecked(page);
+ folio_set_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageChecked(page);
+ folio_clear_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -559,10 +538,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
*/
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+ struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
@@ -584,177 +563,199 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
* in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
* back to regular sectorsize branch.
*/
-#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
- test_page_func) \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \
+ folio_clear_func, folio_test_func) \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- btrfs_subpage_clamp_range(page, &start, &len); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
-}
-IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
- PageUptodate);
-IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
- PageDirty);
-IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
- PageWriteback);
-IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
- PageOrdered);
-IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
+ folio_test_uptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
+ folio_test_dirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
+ folio_test_writeback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
+ folio_test_ordered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
+ folio_test_checked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
*/
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage;
+ unsigned int start_bit;
+ unsigned int nbits;
+ unsigned long flags;
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- ASSERT(!PageDirty(page));
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ ASSERT(!folio_test_dirty(folio));
return;
+ }
- ASSERT(PagePrivate(page) && page->private);
- ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+ start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
+ nbits = len >> fs_info->sectorsize_bits;
+ subpage = folio_get_private(folio);
+ ASSERT(subpage);
+ spin_lock_irqsave(&subpage->lock, flags);
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
/*
- * Handle different locked pages with different page sizes:
- *
- * - Page locked by plain lock_page()
- * It should not have any subpage::writers count.
- * Can be unlocked by unlock_page().
- * This is the most common locked page for __extent_writepage() called
- * inside extent_write_cache_pages().
- * Rarer cases include the @locked_page from extent_write_locked_range().
+ * This is for folio already locked by plain lock_page()/folio_lock(), which
+ * doesn't have any subpage awareness.
*
- * - Page locked by lock_delalloc_pages()
- * There is only one caller, all pages except @locked_page for
- * extent_write_locked_range().
- * In this case, we have to call subpage helper to handle the case.
+ * This populates the involved subpage ranges so that subpage helpers can
+ * properly unlock them.
*/
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len)
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
+ unsigned long flags;
+ unsigned int start_bit;
+ unsigned int nbits;
+ int ret;
- ASSERT(PageLocked(page));
- /* For non-subpage case, we just unlock the page */
- if (!btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
-
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
-
- /*
- * For subpage case, there are two types of locked page. With or
- * without writers number.
- *
- * Since we own the page lock, no one else could touch subpage::writers
- * and we are safe to do several atomic operations without spinlock.
- */
- if (atomic_read(&subpage->writers) == 0)
- /* No writers, locked by plain lock_page() */
- return unlock_page(page);
+ ASSERT(folio_test_locked(folio));
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping))
+ return;
- /* Have writers, use proper subpage helper to end it */
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ subpage = folio_get_private(folio);
+ start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
+ nbits = len >> fs_info->sectorsize_bits;
+ spin_lock_irqsave(&subpage->lock, flags);
+ /* Target range should not yet be locked. */
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ bitmap_set(subpage->bitmaps, start_bit, nbits);
+ ret = atomic_add_return(nbits, &subpage->nr_locked);
+ ASSERT(ret <= fs_info->sectors_per_page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
-#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
+#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
{ \
- const int bitmap_nr_bits = subpage_info->bitmap_nr_bits; \
+ const int sectors_per_page = fs_info->sectors_per_page; \
\
- ASSERT(bitmap_nr_bits < BITS_PER_LONG); \
+ ASSERT(sectors_per_page < BITS_PER_LONG); \
*dst = bitmap_read(subpage->bitmaps, \
- subpage_info->name##_offset, \
- bitmap_nr_bits); \
+ sectors_per_page * btrfs_bitmap_nr_##name, \
+ sectors_per_page); \
}
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage;
+ const u32 sectors_per_page = fs_info->sectors_per_page;
unsigned long uptodate_bitmap;
- unsigned long error_bitmap;
unsigned long dirty_bitmap;
unsigned long writeback_bitmap;
unsigned long ordered_bitmap;
unsigned long checked_bitmap;
+ unsigned long locked_bitmap;
unsigned long flags;
- ASSERT(PagePrivate(page) && page->private);
- ASSERT(subpage_info);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ ASSERT(sectors_per_page > 1);
+ subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
- dump_page(page, "btrfs subpage dump");
+ dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
-"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
- start, len, page_offset(page),
- subpage_info->bitmap_nr_bits, &uptodate_bitmap,
- subpage_info->bitmap_nr_bits, &error_bitmap,
- subpage_info->bitmap_nr_bits, &dirty_bitmap,
- subpage_info->bitmap_nr_bits, &writeback_bitmap,
- subpage_info->bitmap_nr_bits, &ordered_bitmap,
- subpage_info->bitmap_nr_bits, &checked_bitmap);
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+ start, len, folio_pos(folio),
+ sectors_per_page, &uptodate_bitmap,
+ sectors_per_page, &dirty_bitmap,
+ sectors_per_page, &locked_bitmap,
+ sectors_per_page, &writeback_bitmap,
+ sectors_per_page, &ordered_bitmap,
+ sectors_per_page, &checked_bitmap);
+}
+
+void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+ struct folio *folio,
+ unsigned long *ret_bitmap)
+{
+ struct btrfs_subpage *subpage;
+ unsigned long flags;
+
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ ASSERT(fs_info->sectors_per_page > 1);
+ subpage = folio_get_private(folio);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 5cbf67ccbdeb..44fff1f4eac4 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -4,6 +4,12 @@
#define BTRFS_SUBPAGE_H
#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/sizes.h>
+
+struct address_space;
+struct folio;
+struct btrfs_fs_info;
/*
* Extra info for subpapge bitmap.
@@ -13,29 +19,23 @@
*
* This structure records how they are organized in the bitmap:
*
- * /- uptodate_offset /- dirty_offset /- ordered_offset
+ * /- uptodate /- dirty /- ordered
* | | |
* v v v
* |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o|
- * |<- bitmap_nr_bits ->|
- * |<----------------- total_nr_bits ------------------>|
+ * |< sectors_per_page >|
+ *
+ * Unlike regular macro-like enums, here we do not go upper-case names, as
+ * these names will be utilized in various macros to define function names.
*/
-struct btrfs_subpage_info {
- /* Number of bits for each bitmap */
- unsigned int bitmap_nr_bits;
-
- /* Total number of bits for the whole bitmap */
- unsigned int total_nr_bits;
-
- /*
- * *_start indicates where the bitmap starts, the length is always
- * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
- */
- unsigned int uptodate_offset;
- unsigned int dirty_offset;
- unsigned int writeback_offset;
- unsigned int ordered_offset;
- unsigned int checked_offset;
+enum {
+ btrfs_bitmap_nr_uptodate = 0,
+ btrfs_bitmap_nr_dirty,
+ btrfs_bitmap_nr_writeback,
+ btrfs_bitmap_nr_ordered,
+ btrfs_bitmap_nr_checked,
+ btrfs_bitmap_nr_locked,
+ btrfs_bitmap_nr_max
};
/*
@@ -45,14 +45,6 @@ struct btrfs_subpage_info {
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- /*
- * Both data and metadata needs to track how many readers are for the
- * page.
- * Data relies on @readers to unlock the page when last reader finished.
- * While metadata doesn't need page unlock, it needs to prevent
- * page::private get cleared before the last end_page_read().
- */
- atomic_t readers;
union {
/*
* Structures only used by metadata
@@ -62,8 +54,12 @@ struct btrfs_subpage {
*/
atomic_t eb_refs;
- /* Structures only used by data */
- atomic_t writers;
+ /*
+ * Structures only used by data,
+ *
+ * How many sectors inside the page is locked.
+ */
+ atomic_t nr_locked;
};
unsigned long bitmaps[];
};
@@ -73,71 +69,67 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+#if PAGE_SIZE > SZ_4K
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
+#else
+static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
+ struct address_space *mapping)
+{
+ return false;
+}
+#endif
-void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page);
+ struct folio *folio, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
/* Allocate additional data where page represents more than one sector */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
-
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-
-void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap);
/*
* Template for subpage related operations.
*
- * btrfs_subpage_*() are for call sites where the page has subpage attached and
- * the range is ensured to be inside the page.
+ * btrfs_subpage_*() are for call sites where the folio has subpage attached and
+ * the range is ensured to be inside the folio's single page.
*
- * btrfs_page_*() are for call sites where the page can either be subpage
- * specific or regular page. The function will handle both cases.
- * But the range still needs to be inside the page.
+ * btrfs_folio_*() are for call sites where the page can either be subpage
+ * specific or regular folios. The function will handle both cases.
+ * But the range still needs to be inside one single page.
*
- * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -145,14 +137,28 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
DECLARE_BTRFS_SUBPAGE_OPS(checked);
+/*
+ * Helper for error cleanup, where a folio will have its dirty flag cleared,
+ * with writeback started and finished.
+ */
+static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info,
+ struct folio *locked_folio,
+ u64 start, u32 len)
+{
+ btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len);
+}
+
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len);
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+ struct folio *folio,
+ unsigned long *ret_bitmap);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e33587a81409..6119a06b0569 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,19 +26,20 @@
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
+#include <linux/security.h>
+#include <linux/fs_parser.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
+#include "direct-io.h"
#include "props.h"
#include "xattr.h"
#include "bio.h"
#include "export.h"
#include "compression.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "free-space-cache.h"
#include "backref.h"
@@ -63,19 +64,7 @@
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
-
-/*
- * Types for mounting the default subvolume and a subvolume explicitly
- * requested by subvol=/path. That way the callchain is straightforward and we
- * don't have to play tricks with the mount options and recursive calls to
- * btrfs_mount.
- *
- * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
- */
static struct file_system_type btrfs_fs_type;
-static struct file_system_type btrfs_root_fs_type;
-
-static int btrfs_remount(struct super_block *sb, int *flags, char *data);
static void btrfs_put_super(struct super_block *sb)
{
@@ -85,8 +74,22 @@ static void btrfs_put_super(struct super_block *sb)
close_ctree(fs_info);
}
+/* Store the mount options related information. */
+struct btrfs_fs_context {
+ char *subvol_name;
+ u64 subvol_objectid;
+ u64 max_inline;
+ u32 commit_interval;
+ u32 metadata_ratio;
+ u32 thread_pool_size;
+ unsigned long long mount_opt;
+ unsigned long compress_type:4;
+ unsigned int compress_level;
+ refcount_t refs;
+};
+
enum {
- Opt_acl, Opt_noacl,
+ Opt_acl,
Opt_clear_cache,
Opt_commit_interval,
Opt_compress,
@@ -96,48 +99,38 @@ enum {
Opt_degraded,
Opt_device,
Opt_fatal_errors,
- Opt_flushoncommit, Opt_noflushoncommit,
+ Opt_flushoncommit,
Opt_max_inline,
- Opt_barrier, Opt_nobarrier,
- Opt_datacow, Opt_nodatacow,
- Opt_datasum, Opt_nodatasum,
- Opt_defrag, Opt_nodefrag,
- Opt_discard, Opt_nodiscard,
+ Opt_barrier,
+ Opt_datacow,
+ Opt_datasum,
+ Opt_defrag,
+ Opt_discard,
Opt_discard_mode,
- Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
Opt_skip_balance,
- Opt_space_cache, Opt_no_space_cache,
+ Opt_space_cache,
Opt_space_cache_version,
- Opt_ssd, Opt_nossd,
- Opt_ssd_spread, Opt_nossd_spread,
+ Opt_ssd,
+ Opt_ssd_spread,
Opt_subvol,
Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
- Opt_treelog, Opt_notreelog,
+ Opt_treelog,
Opt_user_subvol_rm_allowed,
+ Opt_norecovery,
/* Rescue options */
Opt_rescue,
Opt_usebackuproot,
Opt_nologreplay,
- Opt_ignorebadroots,
- Opt_ignoredatacsums,
- Opt_rescue_all,
-
- /* Deprecated options */
- Opt_recovery,
- Opt_inode_cache, Opt_noinode_cache,
/* Debugging options */
- Opt_check_integrity,
- Opt_check_integrity_including_extent_data,
- Opt_check_integrity_print_mask,
- Opt_enospc_debug, Opt_noenospc_debug,
+ Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
- Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+ Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
@@ -145,826 +138,651 @@ enum {
Opt_err,
};
-static const match_table_t tokens = {
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_clear_cache, "clear_cache"},
- {Opt_commit_interval, "commit=%u"},
- {Opt_compress, "compress"},
- {Opt_compress_type, "compress=%s"},
- {Opt_compress_force, "compress-force"},
- {Opt_compress_force_type, "compress-force=%s"},
- {Opt_degraded, "degraded"},
- {Opt_device, "device=%s"},
- {Opt_fatal_errors, "fatal_errors=%s"},
- {Opt_flushoncommit, "flushoncommit"},
- {Opt_noflushoncommit, "noflushoncommit"},
- {Opt_inode_cache, "inode_cache"},
- {Opt_noinode_cache, "noinode_cache"},
- {Opt_max_inline, "max_inline=%s"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_datacow, "datacow"},
- {Opt_nodatacow, "nodatacow"},
- {Opt_datasum, "datasum"},
- {Opt_nodatasum, "nodatasum"},
- {Opt_defrag, "autodefrag"},
- {Opt_nodefrag, "noautodefrag"},
- {Opt_discard, "discard"},
- {Opt_discard_mode, "discard=%s"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_norecovery, "norecovery"},
- {Opt_ratio, "metadata_ratio=%u"},
- {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
- {Opt_skip_balance, "skip_balance"},
- {Opt_space_cache, "space_cache"},
- {Opt_no_space_cache, "nospace_cache"},
- {Opt_space_cache_version, "space_cache=%s"},
- {Opt_ssd, "ssd"},
- {Opt_nossd, "nossd"},
- {Opt_ssd_spread, "ssd_spread"},
- {Opt_nossd_spread, "nossd_spread"},
- {Opt_subvol, "subvol=%s"},
- {Opt_subvol_empty, "subvol="},
- {Opt_subvolid, "subvolid=%s"},
- {Opt_thread_pool, "thread_pool=%u"},
- {Opt_treelog, "treelog"},
- {Opt_notreelog, "notreelog"},
- {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+enum {
+ Opt_fatal_errors_panic,
+ Opt_fatal_errors_bug,
+};
- /* Rescue options */
- {Opt_rescue, "rescue=%s"},
+static const struct constant_table btrfs_parameter_fatal_errors[] = {
+ { "panic", Opt_fatal_errors_panic },
+ { "bug", Opt_fatal_errors_bug },
+ {}
+};
+
+enum {
+ Opt_discard_sync,
+ Opt_discard_async,
+};
+
+static const struct constant_table btrfs_parameter_discard[] = {
+ { "sync", Opt_discard_sync },
+ { "async", Opt_discard_async },
+ {}
+};
+
+enum {
+ Opt_space_cache_v1,
+ Opt_space_cache_v2,
+};
+
+static const struct constant_table btrfs_parameter_space_cache[] = {
+ { "v1", Opt_space_cache_v1 },
+ { "v2", Opt_space_cache_v2 },
+ {}
+};
+
+enum {
+ Opt_rescue_usebackuproot,
+ Opt_rescue_nologreplay,
+ Opt_rescue_ignorebadroots,
+ Opt_rescue_ignoredatacsums,
+ Opt_rescue_ignoremetacsums,
+ Opt_rescue_ignoresuperflags,
+ Opt_rescue_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_rescue[] = {
+ { "usebackuproot", Opt_rescue_usebackuproot },
+ { "nologreplay", Opt_rescue_nologreplay },
+ { "ignorebadroots", Opt_rescue_ignorebadroots },
+ { "ibadroots", Opt_rescue_ignorebadroots },
+ { "ignoredatacsums", Opt_rescue_ignoredatacsums },
+ { "ignoremetacsums", Opt_rescue_ignoremetacsums},
+ { "ignoresuperflags", Opt_rescue_ignoresuperflags},
+ { "idatacsums", Opt_rescue_ignoredatacsums },
+ { "imetacsums", Opt_rescue_ignoremetacsums},
+ { "isuperflags", Opt_rescue_ignoresuperflags},
+ { "all", Opt_rescue_parameter_all },
+ {}
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+enum {
+ Opt_fragment_parameter_data,
+ Opt_fragment_parameter_metadata,
+ Opt_fragment_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_fragment[] = {
+ { "data", Opt_fragment_parameter_data },
+ { "metadata", Opt_fragment_parameter_metadata },
+ { "all", Opt_fragment_parameter_all },
+ {}
+};
+#endif
+
+static const struct fs_parameter_spec btrfs_fs_parameters[] = {
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_flag_no("autodefrag", Opt_defrag),
+ fsparam_flag_no("barrier", Opt_barrier),
+ fsparam_flag("clear_cache", Opt_clear_cache),
+ fsparam_u32("commit", Opt_commit_interval),
+ fsparam_flag("compress", Opt_compress),
+ fsparam_string("compress", Opt_compress_type),
+ fsparam_flag("compress-force", Opt_compress_force),
+ fsparam_string("compress-force", Opt_compress_force_type),
+ fsparam_flag_no("datacow", Opt_datacow),
+ fsparam_flag_no("datasum", Opt_datasum),
+ fsparam_flag("degraded", Opt_degraded),
+ fsparam_string("device", Opt_device),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard),
+ fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors),
+ fsparam_flag_no("flushoncommit", Opt_flushoncommit),
+ fsparam_string("max_inline", Opt_max_inline),
+ fsparam_u32("metadata_ratio", Opt_ratio),
+ fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree),
+ fsparam_flag("skip_balance", Opt_skip_balance),
+ fsparam_flag_no("space_cache", Opt_space_cache),
+ fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache),
+ fsparam_flag_no("ssd", Opt_ssd),
+ fsparam_flag_no("ssd_spread", Opt_ssd_spread),
+ fsparam_string("subvol", Opt_subvol),
+ fsparam_flag("subvol=", Opt_subvol_empty),
+ fsparam_u64("subvolid", Opt_subvolid),
+ fsparam_u32("thread_pool", Opt_thread_pool),
+ fsparam_flag_no("treelog", Opt_treelog),
+ fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed),
+
+ /* Rescue options. */
+ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
/* Deprecated, with alias rescue=nologreplay */
- {Opt_nologreplay, "nologreplay"},
+ __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
- {Opt_usebackuproot, "usebackuproot"},
+ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
+ /* For compatibility only, alias for "rescue=nologreplay". */
+ fsparam_flag("norecovery", Opt_norecovery),
- /* Deprecated options */
- {Opt_recovery, "recovery"},
-
- /* Debugging options */
- {Opt_check_integrity, "check_int"},
- {Opt_check_integrity_including_extent_data, "check_int_data"},
- {Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
- {Opt_enospc_debug, "enospc_debug"},
- {Opt_noenospc_debug, "noenospc_debug"},
+ /* Debugging options. */
+ fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
- {Opt_fragment_data, "fragment=data"},
- {Opt_fragment_metadata, "fragment=metadata"},
- {Opt_fragment_all, "fragment=all"},
+ fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- {Opt_ref_verify, "ref_verify"},
+ fsparam_flag("ref_verify", Opt_ref_verify),
#endif
- {Opt_err, NULL},
+ {}
};
-static const match_table_t rescue_tokens = {
- {Opt_usebackuproot, "usebackuproot"},
- {Opt_nologreplay, "nologreplay"},
- {Opt_ignorebadroots, "ignorebadroots"},
- {Opt_ignorebadroots, "ibadroots"},
- {Opt_ignoredatacsums, "ignoredatacsums"},
- {Opt_ignoredatacsums, "idatacsums"},
- {Opt_rescue_all, "all"},
- {Opt_err, NULL},
-};
-
-static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
- const char *opt_name)
+/* No support for restricting writes to btrfs devices yet... */
+static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
{
- if (fs_info->mount_opt & opt) {
- btrfs_err(fs_info, "%s must be used with ro mount option",
- opt_name);
- return true;
- }
- return false;
+ return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
}
-static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *opts;
- char *orig;
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int ret = 0;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ opt = fs_parse(fc, btrfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&opts, ":")) != NULL) {
- int token;
+ switch (opt) {
+ case Opt_degraded:
+ btrfs_set_opt(ctx->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol_empty:
+ /*
+ * This exists because we used to allow it on accident, so we're
+ * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow
+ * empty subvol= again").
+ */
+ break;
+ case Opt_subvol:
+ kfree(ctx->subvol_name);
+ ctx->subvol_name = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->subvol_name)
+ return -ENOMEM;
+ break;
+ case Opt_subvolid:
+ ctx->subvol_objectid = result.uint_64;
- if (!*p)
- continue;
- token = match_token(p, rescue_tokens, args);
- switch (token){
- case Opt_usebackuproot:
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
- break;
- case Opt_nologreplay:
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_ignorebadroots:
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- break;
- case Opt_ignoredatacsums:
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- break;
- case Opt_rescue_all:
- btrfs_info(info, "enabling all of the rescue options");
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_err:
- btrfs_info(info, "unrecognized rescue option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ /* subvolid=0 means give me the original fs_tree. */
+ if (!ctx->subvol_objectid)
+ ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID;
+ break;
+ case Opt_device: {
+ struct btrfs_device *device;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ mutex_lock(&uuid_mutex);
+ device = btrfs_scan_one_device(param->string, mode, false);
+ mutex_unlock(&uuid_mutex);
+ if (IS_ERR(device))
+ return PTR_ERR(device);
+ break;
}
-out:
- kfree(orig);
- return ret;
-}
-
-/*
- * Regular mount options parser. Everything that is needed only when
- * reading in a new superblock is parsed here.
- * XXX JDM: This needs to be cleaned up for remount.
- */
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p, *num;
- int intarg;
- int ret = 0;
- char *compress_type;
- bool compress_force = false;
- enum btrfs_compression_type saved_compress_type;
- int saved_compress_level;
- bool saved_compress_force;
- int no_compress = 0;
- const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
-
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
- btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
- else if (btrfs_free_space_cache_v1_active(info)) {
- if (btrfs_is_zoned(info)) {
- btrfs_info(info,
- "zoned: clearing existing space cache");
- btrfs_set_super_cache_generation(info->super_copy, 0);
+ case Opt_datasum:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
} else {
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
}
- }
-
- /*
- * Even the options are empty, we still need to do extra check
- * against new flags
- */
- if (!options)
- goto check;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_degraded:
- btrfs_info(info, "allowing degraded mounts");
- btrfs_set_opt(info->mount_opt, DEGRADED);
- break;
- case Opt_subvol:
- case Opt_subvol_empty:
- case Opt_subvolid:
- case Opt_device:
- /*
- * These are parsed by btrfs_parse_subvol_options or
- * btrfs_parse_device_options and can be ignored here.
- */
- break;
- case Opt_nodatasum:
- btrfs_set_and_info(info, NODATASUM,
- "setting nodatasum");
- break;
- case Opt_datasum:
- if (btrfs_test_opt(info, NODATASUM)) {
- if (btrfs_test_opt(info, NODATACOW))
- btrfs_info(info,
- "setting datasum, datacow enabled");
- else
- btrfs_info(info, "setting datasum");
- }
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_nodatacow:
- if (!btrfs_test_opt(info, NODATACOW)) {
- if (!btrfs_test_opt(info, COMPRESS) ||
- !btrfs_test_opt(info, FORCE_COMPRESS)) {
- btrfs_info(info,
- "setting nodatacow, compression disabled");
- } else {
- btrfs_info(info, "setting nodatacow");
- }
- }
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- btrfs_set_opt(info->mount_opt, NODATACOW);
- btrfs_set_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_datacow:
- btrfs_clear_and_info(info, NODATACOW,
- "setting datacow");
- break;
- case Opt_compress_force:
- case Opt_compress_force_type:
- compress_force = true;
- fallthrough;
- case Opt_compress:
- case Opt_compress_type:
- saved_compress_type = btrfs_test_opt(info,
- COMPRESS) ?
- info->compress_type : BTRFS_COMPRESS_NONE;
- saved_compress_force =
- btrfs_test_opt(info, FORCE_COMPRESS);
- saved_compress_level = info->compress_level;
- if (token == Opt_compress ||
- token == Opt_compress_force ||
- strncmp(args[0].from, "zlib", 4) == 0) {
- compress_type = "zlib";
-
- info->compress_type = BTRFS_COMPRESS_ZLIB;
- info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- /*
- * args[0] contains uninitialized data since
- * for these tokens we don't expect any
- * parameter.
- */
- if (token != Opt_compress &&
- token != Opt_compress_force)
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZLIB,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- no_compress = 0;
- } else if (strncmp(args[0].from, "lzo", 3) == 0) {
- compress_type = "lzo";
- info->compress_type = BTRFS_COMPRESS_LZO;
- info->compress_level = 0;
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_LZO);
- no_compress = 0;
- } else if (strncmp(args[0].from, "zstd", 4) == 0) {
- compress_type = "zstd";
- info->compress_type = BTRFS_COMPRESS_ZSTD;
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZSTD,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
- no_compress = 0;
- } else if (strncmp(args[0].from, "no", 2) == 0) {
- compress_type = "no";
- info->compress_level = 0;
- info->compress_type = 0;
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- compress_force = false;
- no_compress++;
- } else {
- btrfs_err(info, "unrecognized compression value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
-
- if (compress_force) {
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
- } else {
- /*
- * If we remount from compress-force=xxx to
- * compress=xxx, we need clear FORCE_COMPRESS
- * flag, otherwise, there is no way for users
- * to disable forcible compression separately.
- */
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- }
- if (no_compress == 1) {
- btrfs_info(info, "use no compression");
- } else if ((info->compress_type != saved_compress_type) ||
- (compress_force != saved_compress_force) ||
- (info->compress_level != saved_compress_level)) {
- btrfs_info(info, "%s %s compression, level %d",
- (compress_force) ? "force" : "use",
- compress_type, info->compress_level);
- }
- compress_force = false;
- break;
- case Opt_ssd:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_ssd_spread:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_set_and_info(info, SSD_SPREAD,
- "using spread ssd allocation scheme");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_nossd:
- btrfs_set_opt(info->mount_opt, NOSSD);
- btrfs_clear_and_info(info, SSD,
- "not using ssd optimizations");
- fallthrough;
- case Opt_nossd_spread:
- btrfs_clear_and_info(info, SSD_SPREAD,
- "not using spread ssd allocation scheme");
- break;
- case Opt_barrier:
- btrfs_clear_and_info(info, NOBARRIER,
- "turning on barriers");
- break;
- case Opt_nobarrier:
- btrfs_set_and_info(info, NOBARRIER,
- "turning off barriers");
- break;
- case Opt_thread_pool:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized thread_pool value %s",
- args[0].from);
- goto out;
- } else if (intarg == 0) {
- btrfs_err(info, "invalid value 0 for thread_pool");
- ret = -EINVAL;
- goto out;
- }
- info->thread_pool_size = intarg;
- break;
- case Opt_max_inline:
- num = match_strdup(&args[0]);
- if (num) {
- info->max_inline = memparse(num, NULL);
- kfree(num);
-
- if (info->max_inline) {
- info->max_inline = min_t(u64,
- info->max_inline,
- info->sectorsize);
- }
- btrfs_info(info, "max_inline at %llu",
- info->max_inline);
- } else {
- ret = -ENOMEM;
- goto out;
- }
- break;
- case Opt_acl:
+ break;
+ case Opt_datacow:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ btrfs_set_opt(ctx->mount_opt, NODATACOW);
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ }
+ break;
+ case Opt_compress_force:
+ case Opt_compress_force_type:
+ btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS);
+ fallthrough;
+ case Opt_compress:
+ case Opt_compress_type:
+ /*
+ * Provide the same semantics as older kernels that don't use fs
+ * context, specifying the "compress" option clears
+ * "force-compress" without the need to pass
+ * "compress-force=[no|none]" before specifying "compress".
+ */
+ if (opt != Opt_compress_force && opt != Opt_compress_force_type)
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zlib", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "lzo", 3) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ctx->compress_level = 0;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zstd", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "no", 2) == 0) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ btrfs_err(NULL, "unrecognized compression value %s",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case Opt_ssd:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_ssd_spread:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_set_opt(ctx->mount_opt, SSD_SPREAD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_barrier:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOBARRIER);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOBARRIER);
+ break;
+ case Opt_thread_pool:
+ if (result.uint_32 == 0) {
+ btrfs_err(NULL, "invalid value 0 for thread_pool");
+ return -EINVAL;
+ }
+ ctx->thread_pool_size = result.uint_32;
+ break;
+ case Opt_max_inline:
+ ctx->max_inline = memparse(param->string, NULL);
+ break;
+ case Opt_acl:
+ if (result.negated) {
+ fc->sb_flags &= ~SB_POSIXACL;
+ } else {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- info->sb->s_flags |= SB_POSIXACL;
- break;
+ fc->sb_flags |= SB_POSIXACL;
#else
- btrfs_err(info, "support for ACL not compiled in!");
- ret = -EINVAL;
- goto out;
+ btrfs_err(NULL, "support for ACL not compiled in");
+ return -EINVAL;
#endif
- case Opt_noacl:
- info->sb->s_flags &= ~SB_POSIXACL;
- break;
- case Opt_notreelog:
- btrfs_set_and_info(info, NOTREELOG,
- "disabling tree log");
- break;
- case Opt_treelog:
- btrfs_clear_and_info(info, NOTREELOG,
- "enabling tree log");
- break;
- case Opt_norecovery:
- case Opt_nologreplay:
- btrfs_warn(info,
+ }
+ /*
+ * VFS limits the ability to toggle ACL on and off via remount,
+ * despite every file system allowing this. This seems to be
+ * an oversight since we all do, but it'll fail if we're
+ * remounting. So don't set the mask here, we'll check it in
+ * btrfs_reconfigure and do the toggling ourselves.
+ */
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+ fc->sb_flags_mask |= SB_POSIXACL;
+ break;
+ case Opt_treelog:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOTREELOG);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
+ break;
+ case Opt_nologreplay:
+ btrfs_warn(NULL,
"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_flushoncommit:
- btrfs_set_and_info(info, FLUSHONCOMMIT,
- "turning on flush-on-commit");
- break;
- case Opt_noflushoncommit:
- btrfs_clear_and_info(info, FLUSHONCOMMIT,
- "turning off flush-on-commit");
- break;
- case Opt_ratio:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized metadata_ratio value %s",
- args[0].from);
- goto out;
- }
- info->metadata_ratio = intarg;
- btrfs_info(info, "metadata ratio %u",
- info->metadata_ratio);
- break;
- case Opt_discard:
- case Opt_discard_mode:
- if (token == Opt_discard ||
- strcmp(args[0].from, "sync") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
- btrfs_set_and_info(info, DISCARD_SYNC,
- "turning on sync discard");
- } else if (strcmp(args[0].from, "async") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
- btrfs_set_and_info(info, DISCARD_ASYNC,
- "turning on async discard");
- } else {
- btrfs_err(info, "unrecognized discard mode value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- btrfs_clear_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_nodiscard:
- btrfs_clear_and_info(info, DISCARD_SYNC,
- "turning off discard");
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "turning off async discard");
- btrfs_set_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_space_cache:
- case Opt_space_cache_version:
- /*
- * We already set FREE_SPACE_TREE above because we have
- * compat_ro(FREE_SPACE_TREE) set, and we aren't going
- * to allow v1 to be set for extent tree v2, simply
- * ignore this setting if we're extent tree v2.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (token == Opt_space_cache ||
- strcmp(args[0].from, "v1") == 0) {
- btrfs_clear_opt(info->mount_opt,
- FREE_SPACE_TREE);
- btrfs_set_and_info(info, SPACE_CACHE,
- "enabling disk space caching");
- } else if (strcmp(args[0].from, "v2") == 0) {
- btrfs_clear_opt(info->mount_opt,
- SPACE_CACHE);
- btrfs_set_and_info(info, FREE_SPACE_TREE,
- "enabling free space tree");
- } else {
- btrfs_err(info, "unrecognized space_cache value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- break;
- case Opt_rescan_uuid_tree:
- btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
- break;
- case Opt_no_space_cache:
- /*
- * We cannot operate without the free space tree with
- * extent tree v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (btrfs_test_opt(info, SPACE_CACHE)) {
- btrfs_clear_and_info(info, SPACE_CACHE,
- "disabling disk space caching");
- }
- if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_clear_and_info(info, FREE_SPACE_TREE,
- "disabling free space tree");
- }
- break;
- case Opt_inode_cache:
- case Opt_noinode_cache:
- btrfs_warn(info,
- "the 'inode_cache' option is deprecated and has no effect since 5.11");
- break;
- case Opt_clear_cache:
- /*
- * We cannot clear the free space tree with extent tree
- * v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- btrfs_set_and_info(info, CLEAR_CACHE,
- "force clearing of disk cache");
- break;
- case Opt_user_subvol_rm_allowed:
- btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+ break;
+ case Opt_norecovery:
+ btrfs_info(NULL,
+"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'");
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+ break;
+ case Opt_flushoncommit:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ else
+ btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ break;
+ case Opt_ratio:
+ ctx->metadata_ratio = result.uint_32;
+ break;
+ case Opt_discard:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, NODISCARD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ }
+ break;
+ case Opt_discard_mode:
+ switch (result.uint_32) {
+ case Opt_discard_sync:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
break;
- case Opt_enospc_debug:
- btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+ case Opt_discard_async:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC);
break;
- case Opt_noenospc_debug:
- btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ default:
+ btrfs_err(NULL, "unrecognized discard mode value %s",
+ param->key);
+ return -EINVAL;
+ }
+ btrfs_clear_opt(ctx->mount_opt, NODISCARD);
+ break;
+ case Opt_space_cache:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSPACECACHE);
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ }
+ break;
+ case Opt_space_cache_version:
+ switch (result.uint_32) {
+ case Opt_space_cache_v1:
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_defrag:
- btrfs_set_and_info(info, AUTO_DEFRAG,
- "enabling auto defrag");
+ case Opt_space_cache_v2:
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_nodefrag:
- btrfs_clear_and_info(info, AUTO_DEFRAG,
- "disabling auto defrag");
+ default:
+ btrfs_err(NULL, "unrecognized space_cache value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_rescan_uuid_tree:
+ btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE);
+ break;
+ case Opt_clear_cache:
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_user_subvol_rm_allowed:
+ btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ break;
+ case Opt_enospc_debug:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ else
+ btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG);
+ else
+ btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG);
+ break;
+ case Opt_usebackuproot:
+ btrfs_warn(NULL,
+ "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead");
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
+
+ /* If we're loading the backup roots we can't trust the space cache. */
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_skip_balance:
+ btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE);
+ break;
+ case Opt_fatal_errors:
+ switch (result.uint_32) {
+ case Opt_fatal_errors_panic:
+ btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_recovery:
- case Opt_usebackuproot:
- btrfs_warn(info,
- "'%s' is deprecated, use 'rescue=usebackuproot' instead",
- token == Opt_recovery ? "recovery" :
- "usebackuproot");
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ case Opt_fatal_errors_bug:
+ btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_skip_balance:
- btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ default:
+ btrfs_err(NULL, "unrecognized fatal_errors value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_commit_interval:
+ ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) {
+ btrfs_warn(NULL, "excessive commit interval %u, use with care",
+ ctx->commit_interval);
+ }
+ if (ctx->commit_interval == 0)
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ break;
+ case Opt_rescue:
+ switch (result.uint_32) {
+ case Opt_rescue_usebackuproot:
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
break;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- case Opt_check_integrity_including_extent_data:
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info,
- "enabling check integrity including extent data");
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ case Opt_rescue_nologreplay:
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
- case Opt_check_integrity:
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info, "enabling check integrity");
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ case Opt_rescue_ignorebadroots:
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
break;
- case Opt_check_integrity_print_mask:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info,
- "unrecognized check_integrity_print_mask value %s",
- args[0].from);
- goto out;
- }
- info->check_integrity_print_mask = intarg;
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info, "check_integrity_print_mask 0x%x",
- info->check_integrity_print_mask);
+ case Opt_rescue_ignoredatacsums:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
break;
-#else
- case Opt_check_integrity_including_extent_data:
- case Opt_check_integrity:
- case Opt_check_integrity_print_mask:
- btrfs_err(info,
- "support for check_integrity* not compiled in!");
- ret = -EINVAL;
- goto out;
-#endif
- case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0) {
- btrfs_set_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else if (strcmp(args[0].from, "bug") == 0) {
- btrfs_clear_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else {
- btrfs_err(info, "unrecognized fatal_errors value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
+ case Opt_rescue_ignoremetacsums:
+ btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS);
break;
- case Opt_commit_interval:
- intarg = 0;
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized commit_interval value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- if (intarg == 0) {
- btrfs_info(info,
- "using default commit interval %us",
- BTRFS_DEFAULT_COMMIT_INTERVAL);
- intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
- } else if (intarg > 300) {
- btrfs_warn(info, "excessive commit interval %d",
- intarg);
- }
- info->commit_interval = intarg;
+ case Opt_rescue_ignoresuperflags:
+ btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS);
break;
- case Opt_rescue:
- ret = parse_rescue_options(info, args[0].from);
- if (ret < 0) {
- btrfs_err(info, "unrecognized rescue value %s",
- args[0].from);
- goto out;
- }
+ case Opt_rescue_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS);
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
+ default:
+ btrfs_info(NULL, "unrecognized rescue option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#ifdef CONFIG_BTRFS_DEBUG
- case Opt_fragment_all:
- btrfs_info(info, "fragmenting all space");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
- btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ case Opt_fragment:
+ switch (result.uint_32) {
+ case Opt_fragment_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_metadata:
- btrfs_info(info, "fragmenting metadata");
- btrfs_set_opt(info->mount_opt,
- FRAGMENT_METADATA);
+ case Opt_fragment_parameter_metadata:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_data:
- btrfs_info(info, "fragmenting data");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ case Opt_fragment_parameter_data:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
break;
+ default:
+ btrfs_info(NULL, "unrecognized fragment option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- case Opt_ref_verify:
- btrfs_info(info, "doing ref verification");
- btrfs_set_opt(info->mount_opt, REF_VERIFY);
- break;
+ case Opt_ref_verify:
+ btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
+ break;
#endif
- case Opt_err:
- btrfs_err(info, "unrecognized mount option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ default:
+ btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
+ return -EINVAL;
}
-check:
- /* We're read-only, don't have to check. */
- if (new_flags & SB_RDONLY)
- goto out;
- if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
- ret = -EINVAL;
-out:
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, CLEAR_CACHE)) {
- btrfs_err(info, "cannot disable free space tree");
- ret = -EINVAL;
- }
- if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_err(info, "cannot disable free space tree with block-group-tree feature");
- ret = -EINVAL;
- }
- if (!ret)
- ret = btrfs_check_mountopts_zoned(info);
- if (!ret && !remounting) {
- if (btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
- }
- return ret;
+ return 0;
}
/*
- * Parse mount options that are required early in the mount process.
- *
- * All other options will be parsed on much later in the mount process and
- * only when we need to allocate a new super block.
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount code
+ * paths.
*/
-static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
+static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *device_name, *opts, *orig, *p;
- struct btrfs_device *device = NULL;
- int error = 0;
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+ btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
+}
- lockdep_assert_held(&uuid_mutex);
+static bool check_ro_option(const struct btrfs_fs_info *fs_info,
+ unsigned long long mount_opt, unsigned long long opt,
+ const char *opt_name)
+{
+ if (mount_opt & opt) {
+ btrfs_err(fs_info, "%s must be used with ro mount option",
+ opt_name);
+ return true;
+ }
+ return false;
+}
- if (!options)
- return 0;
+bool btrfs_check_options(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt,
+ unsigned long flags)
+{
+ bool ret = true;
- /*
- * strsep changes the string, duplicate it because btrfs_parse_options
- * gets called later
- */
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (!(flags & SB_RDONLY) &&
+ (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREMETACSUMS, "ignoremetacsums") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNORESUPERFLAGS, "ignoresuperflags")))
+ ret = false;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) {
+ btrfs_err(info, "cannot disable free-space-tree");
+ ret = false;
+ }
+ if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) {
+ btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature");
+ ret = false;
+ }
- if (!*p)
- continue;
+ if (btrfs_check_mountopts_zoned(info, mount_opt))
+ ret = false;
- token = match_token(p, tokens, args);
- if (token == Opt_device) {
- device_name = match_strdup(&args[0]);
- if (!device_name) {
- error = -ENOMEM;
- goto out;
- }
- device = btrfs_scan_one_device(device_name, flags);
- kfree(device_name);
- if (IS_ERR(device)) {
- error = PTR_ERR(device);
- goto out;
- }
+ if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
+ btrfs_info(info, "disk space caching is enabled");
+ btrfs_warn(info,
+"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
}
+ if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
+ btrfs_info(info, "using free-space-tree");
}
-out:
- kfree(orig);
- return error;
+ return ret;
}
/*
- * Parse mount options that are related to subvolume id
+ * This is subtle, we only call this during open_ctree(). We need to pre-load
+ * the mount options with the on-disk settings. Before the new mount API took
+ * effect we would do this on mount and remount. With the new mount API we'll
+ * only do this on the initial mount.
*
- * The value is later passed to mount_subvol()
+ * This isn't a change in behavior, because we're using the current state of the
+ * file system to set the current mount options. If you mounted with special
+ * options to disable these features and then remounted we wouldn't revert the
+ * settings, because mounting without these features cleared the on-disk
+ * settings, so this being called on re-mount is not needed.
*/
-static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
- u64 *subvol_objectid)
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *opts, *orig, *p;
- int error = 0;
- u64 subvolid;
-
- if (!options)
- return 0;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info,
+ "forcing free space tree for sector size %u with page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ }
+ }
/*
- * strsep changes the string, duplicate it because
- * btrfs_parse_device_options gets called later
+ * At this point our mount options are populated, so we only mess with
+ * these settings if we don't have any settings already.
*/
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ return;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ if (btrfs_is_zoned(fs_info) &&
+ btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_info(fs_info, "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(fs_info->super_copy, 0);
+ return;
+ }
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_subvol:
- kfree(*subvol_name);
- *subvol_name = match_strdup(&args[0]);
- if (!*subvol_name) {
- error = -ENOMEM;
- goto out;
- }
- break;
- case Opt_subvolid:
- error = match_u64(&args[0], &subvolid);
- if (error)
- goto out;
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
+ return;
- /* we want the original fs_tree */
- if (subvolid == 0)
- subvolid = BTRFS_FS_TREE_OBJECTID;
+ if (btrfs_test_opt(fs_info, NOSPACECACHE))
+ return;
- *subvol_objectid = subvolid;
- break;
- default:
- break;
- }
- }
+ /*
+ * At this point we don't have explicit options set by the user, set
+ * them ourselves based on the state of the file system.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ else if (btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+}
-out:
- kfree(orig);
- return error;
+static void set_device_specific_options(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, NOSSD) &&
+ !fs_info->fs_devices->rotating)
+ btrfs_set_opt(fs_info->mount_opt, SSD);
+
+ /*
+ * For devices supporting discard turn on discard=async automatically,
+ * unless it's already set or disabled. This could be turned off by
+ * nodiscard for the same mount.
+ *
+ * The zoned mode piggy backs on the discard functionality for
+ * resetting a zone. There is no reason to delay the zone reset as it is
+ * fast enough. So, do not enable async discard for zoned mode.
+ */
+ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+ btrfs_test_opt(fs_info, NODISCARD)) &&
+ fs_info->fs_devices->discardable &&
+ !btrfs_is_zoned(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC);
}
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
@@ -1131,8 +949,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
}
static int btrfs_fill_super(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- void *data)
+ struct btrfs_fs_devices *fs_devices)
{
struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1148,10 +965,6 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- sb->s_flags |= SB_POSIXACL;
-#endif
- sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
err = super_setup_bdi(sb);
@@ -1160,13 +973,13 @@ static int btrfs_fill_super(struct super_block *sb,
return err;
}
- err = open_ctree(sb, fs_devices, (char *)data);
+ err = open_ctree(sb, fs_devices);
if (err) {
- btrfs_err(fs_info, "open_ctree failed");
+ btrfs_err(fs_info, "open_ctree failed: %d", err);
return err;
}
- inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
+ inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
btrfs_handle_fs_error(fs_info, err, NULL);
@@ -1200,7 +1013,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1282,6 +1095,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
print_rescue_option(seq, "ignorebadroots", &printed);
if (btrfs_test_opt(info, IGNOREDATACSUMS))
print_rescue_option(seq, "ignoredatacsums", &printed);
+ if (btrfs_test_opt(info, IGNOREMETACSUMS))
+ print_rescue_option(seq, "ignoremetacsums", &printed);
+ if (btrfs_test_opt(info, IGNORESUPERFLAGS))
+ print_rescue_option(seq, "ignoresuperflags", &printed);
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
@@ -1308,15 +1125,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
- seq_puts(seq, ",check_int_data");
- else if (btrfs_test_opt(info, CHECK_INTEGRITY))
- seq_puts(seq, ",check_int");
- if (info->check_integrity_print_mask)
- seq_printf(seq, ",check_int_print_mask=%d",
- info->check_integrity_print_mask);
-#endif
if (info->metadata_ratio)
seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
@@ -1331,34 +1139,16 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
- seq_printf(seq, ",subvolid=%llu",
- BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
subvol_name = btrfs_get_subvol_name_from_objectid(info,
- BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
- seq_puts(seq, ",subvol=");
- seq_escape(seq, subvol_name, " \t\n\\");
+ seq_show_option(seq, "subvol", subvol_name);
kfree(subvol_name);
}
return 0;
}
-static int btrfs_test_super(struct super_block *s, void *data)
-{
- struct btrfs_fs_info *p = data;
- struct btrfs_fs_info *fs_info = btrfs_sb(s);
-
- return fs_info->fs_devices == p->fs_devices;
-}
-
-static int btrfs_set_super(struct super_block *s, void *data)
-{
- int err = set_anon_super(s, data);
- if (!err)
- s->s_fs_info = data;
- return err;
-}
-
/*
* subvolumes are identified by ino 256
*/
@@ -1402,7 +1192,7 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
struct super_block *s = root->d_sb;
struct btrfs_fs_info *fs_info = btrfs_sb(s);
struct inode *root_inode = d_inode(root);
- u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+ u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root);
ret = 0;
if (!is_subvolume_inode(root_inode)) {
@@ -1434,195 +1224,6 @@ out:
return root;
}
-/*
- * Find a superblock for the given device / mount point.
- *
- * Note: This is based on mount_bdev from fs/super.c with a few additions
- * for multiple device setup. Make sure to keep it in sync.
- */
-static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
- int flags, const char *device_name, void *data)
-{
- struct block_device *bdev = NULL;
- struct super_block *s;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = NULL;
- struct btrfs_fs_info *fs_info = NULL;
- void *new_sec_opts = NULL;
- blk_mode_t mode = sb_open_mode(flags);
- int error = 0;
-
- if (data) {
- error = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (error)
- return ERR_PTR(error);
- }
-
- /*
- * Setup a dummy root and fs_info for test/set super. This is because
- * we don't actually fill this stuff out until open_ctree, but we need
- * then open_ctree will properly initialize the file system specific
- * settings later. btrfs_init_fs_info initializes the static elements
- * of the fs_info (locks and such) to make cleanup easier if we find a
- * superblock with our given fs_devices later on at sget() time.
- */
- fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
- if (!fs_info) {
- error = -ENOMEM;
- goto error_sec_opts;
- }
- btrfs_init_fs_info(fs_info);
-
- fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- if (!fs_info->super_copy || !fs_info->super_for_commit) {
- error = -ENOMEM;
- goto error_fs_info;
- }
-
- mutex_lock(&uuid_mutex);
- error = btrfs_parse_device_options(data, mode);
- if (error) {
- mutex_unlock(&uuid_mutex);
- goto error_fs_info;
- }
-
- device = btrfs_scan_one_device(device_name, mode);
- if (IS_ERR(device)) {
- mutex_unlock(&uuid_mutex);
- error = PTR_ERR(device);
- goto error_fs_info;
- }
-
- fs_devices = device->fs_devices;
- fs_info->fs_devices = fs_devices;
-
- error = btrfs_open_devices(fs_devices, mode, fs_type);
- mutex_unlock(&uuid_mutex);
- if (error)
- goto error_fs_info;
-
- if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }
-
- bdev = fs_devices->latest_dev->bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
- fs_info);
- if (IS_ERR(s)) {
- error = PTR_ERR(s);
- goto error_close_devices;
- }
-
- if (s->s_root) {
- btrfs_close_devices(fs_devices);
- btrfs_free_fs_info(fs_info);
- if ((flags ^ s->s_flags) & SB_RDONLY)
- error = -EBUSY;
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
- s->s_id);
- btrfs_sb(s)->bdev_holder = fs_type;
- error = btrfs_fill_super(s, fs_devices, data);
- }
- if (!error)
- error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
- security_free_mnt_opts(&new_sec_opts);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- return dget(s->s_root);
-
-error_close_devices:
- btrfs_close_devices(fs_devices);
-error_fs_info:
- btrfs_free_fs_info(fs_info);
-error_sec_opts:
- security_free_mnt_opts(&new_sec_opts);
- return ERR_PTR(error);
-}
-
-/*
- * Mount function which is called by VFS layer.
- *
- * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
- * which needs vfsmount* of device's root (/). This means device's root has to
- * be mounted internally in any case.
- *
- * Operation flow:
- * 1. Parse subvol id related options for later use in mount_subvol().
- *
- * 2. Mount device's root (/) by calling vfs_kern_mount().
- *
- * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
- * first place. In order to avoid calling btrfs_mount() again, we use
- * different file_system_type which is not registered to VFS by
- * register_filesystem() (btrfs_root_fs_type). As a result,
- * btrfs_mount_root() is called. The return value will be used by
- * mount_subtree() in mount_subvol().
- *
- * 3. Call mount_subvol() to get the dentry of subvolume. Since there is
- * "btrfs subvolume set-default", mount_subvol() is called always.
- */
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *device_name, void *data)
-{
- struct vfsmount *mnt_root;
- struct dentry *root;
- char *subvol_name = NULL;
- u64 subvol_objectid = 0;
- int error = 0;
-
- error = btrfs_parse_subvol_options(data, &subvol_name,
- &subvol_objectid);
- if (error) {
- kfree(subvol_name);
- return ERR_PTR(error);
- }
-
- /* mount device's root (/) */
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
- if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
- if (flags & SB_RDONLY) {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags & ~SB_RDONLY, device_name, data);
- } else {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags | SB_RDONLY, device_name, data);
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- down_write(&mnt_root->mnt_sb->s_umount);
- error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
- up_write(&mnt_root->mnt_sb->s_umount);
- if (error < 0) {
- root = ERR_PTR(error);
- mntput(mnt_root);
- kfree(subvol_name);
- goto out;
- }
- }
- }
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- /* mount_subvol() will free subvol_name and mnt_root */
- root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
-
-out:
- return root;
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
u32 new_pool_size, u32 old_pool_size)
{
@@ -1645,7 +1246,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
- unsigned long old_opts, int flags)
+ unsigned long long old_opts, int flags)
{
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
@@ -1659,7 +1260,7 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
}
static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
- unsigned long old_opts)
+ unsigned long long old_opts)
{
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
@@ -1685,202 +1286,283 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
-static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+static int btrfs_remount_rw(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- unsigned old_flags = sb->s_flags;
- unsigned long old_opts = fs_info->mount_opt;
- unsigned long old_compress_type = fs_info->compress_type;
- u64 old_max_inline = fs_info->max_inline;
- u32 old_thread_pool_size = fs_info->thread_pool_size;
- u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
- sync_filesystem(sb);
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ if (BTRFS_FS_ERROR(fs_info)) {
+ btrfs_err(fs_info,
+ "remounting read-write after error is not allowed");
+ return -EINVAL;
+ }
- if (data) {
- void *new_sec_opts = NULL;
+ if (fs_info->fs_devices->rw_devices == 0)
+ return -EACCES;
- ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (!ret)
- ret = security_sb_remount(sb, new_sec_opts);
- security_free_mnt_opts(&new_sec_opts);
- if (ret)
- goto restore;
+ if (!btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_warn(fs_info,
+ "too many missing devices, writable remount is not allowed");
+ return -EACCES;
+ }
+
+ if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
+ return -EINVAL;
}
- ret = btrfs_parse_options(fs_info, data, *flags);
+ /*
+ * NOTE: when remounting with a change that does writes, don't put it
+ * anywhere above this point, as we are not sure to be safe to write
+ * until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
- goto restore;
+ return ret;
- ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
- if (ret < 0)
- goto restore;
+ btrfs_clear_sb_rdonly(fs_info->sb);
- btrfs_remount_begin(fs_info, old_opts, *flags);
- btrfs_resize_thread_pool(fs_info,
- fs_info->thread_pool_size, old_thread_pool_size);
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
- (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
- btrfs_warn(fs_info,
- "remount supports changing free space tree only from ro to rw");
- /* Make sure free space cache options match the state on disk */
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- if (btrfs_free_space_cache_v1_active(fs_info)) {
- btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- }
+ /*
+ * If we've gone from readonly -> read-write, we need to get our
+ * sync/async discard lists in the right state.
+ */
+ btrfs_discard_resume(fs_info);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
- goto out;
+ return 0;
+}
- if (*flags & SB_RDONLY) {
- /*
- * this also happens on 'umount -rf' or on shutdown, when
- * the filesystem is busy.
- */
- cancel_work_sync(&fs_info->async_reclaim_work);
- cancel_work_sync(&fs_info->async_data_reclaim_work);
+static int btrfs_remount_ro(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * This also happens on 'umount -rf' or on shutdown, when the
+ * filesystem is busy.
+ */
+ cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
- btrfs_discard_cleanup(fs_info);
+ btrfs_discard_cleanup(fs_info);
- /* wait for the uuid_scan task to finish */
- down(&fs_info->uuid_tree_rescan_sem);
- /* avoid complains from lockdep et al. */
- up(&fs_info->uuid_tree_rescan_sem);
+ /* Wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* Avoid complains from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
- btrfs_set_sb_rdonly(sb);
+ btrfs_set_sb_rdonly(fs_info->sb);
- /*
- * Setting SB_RDONLY will put the cleaner thread to
- * sleep at the next loop if it's already active.
- * If it's already asleep, we'll leave unused block
- * groups on disk until we're mounted read-write again
- * unless we clean them up here.
- */
- btrfs_delete_unused_bgs(fs_info);
+ /*
+ * Setting SB_RDONLY will put the cleaner thread to sleep at the next
+ * loop if it's already active. If it's already asleep, we'll leave
+ * unused block groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ btrfs_delete_unused_bgs(fs_info);
- /*
- * The cleaner task could be already running before we set the
- * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
- * We must make sure that after we finish the remount, i.e. after
- * we call btrfs_commit_super(), the cleaner can no longer start
- * a transaction - either because it was dropping a dead root,
- * running delayed iputs or deleting an unused block group (the
- * cleaner picked a block group from the list of unused block
- * groups before we were able to in the previous call to
- * btrfs_delete_unused_bgs()).
- */
- wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * The cleaner task could be already running before we set the flag
+ * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make
+ * sure that after we finish the remount, i.e. after we call
+ * btrfs_commit_super(), the cleaner can no longer start a transaction
+ * - either because it was dropping a dead root, running delayed iputs
+ * or deleting an unused block group (the cleaner picked a block
+ * group from the list of unused block groups before we were able to
+ * in the previous call to btrfs_delete_unused_bgs()).
+ */
+ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE);
- /*
- * We've set the superblock to RO mode, so we might have made
- * the cleaner task sleep without running all pending delayed
- * iputs. Go through all the delayed iputs here, so that if an
- * unmount happens without remounting RW we don't end up at
- * finishing close_ctree() with a non-empty list of delayed
- * iputs.
- */
- btrfs_run_delayed_iputs(fs_info);
+ /*
+ * We've set the superblock to RO mode, so we might have made the
+ * cleaner task sleep without running all pending delayed iputs. Go
+ * through all the delayed iputs here, so that if an unmount happens
+ * without remounting RW we don't end up at finishing close_ctree()
+ * with a non-empty list of delayed iputs.
+ */
+ btrfs_run_delayed_iputs(fs_info);
- btrfs_dev_replace_suspend_for_unmount(fs_info);
- btrfs_scrub_cancel(fs_info);
- btrfs_pause_balance(fs_info);
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
- /*
- * Pause the qgroup rescan worker if it is running. We don't want
- * it to be still running after we are in RO mode, as after that,
- * by the time we unmount, it might have left a transaction open,
- * so we would leak the transaction and/or crash.
- */
- btrfs_qgroup_wait_for_completion(fs_info, false);
+ /*
+ * Pause the qgroup rescan worker if it is running. We don't want it to
+ * be still running after we are in RO mode, as after that, by the time
+ * we unmount, it might have left a transaction open, so we would leak
+ * the transaction and/or crash.
+ */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
- ret = btrfs_commit_super(fs_info);
- if (ret)
- goto restore;
- } else {
- if (BTRFS_FS_ERROR(fs_info)) {
- btrfs_err(fs_info,
- "Remounting read-write after error is not allowed");
- ret = -EINVAL;
- goto restore;
- }
- if (fs_info->fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto restore;
- }
+ return btrfs_commit_super(fs_info);
+}
- if (!btrfs_check_rw_degradable(fs_info, NULL)) {
- btrfs_warn(fs_info,
- "too many missing devices, writable remount is not allowed");
- ret = -EACCES;
- goto restore;
- }
+static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ fs_info->max_inline = ctx->max_inline;
+ fs_info->commit_interval = ctx->commit_interval;
+ fs_info->metadata_ratio = ctx->metadata_ratio;
+ fs_info->thread_pool_size = ctx->thread_pool_size;
+ fs_info->mount_opt = ctx->mount_opt;
+ fs_info->compress_type = ctx->compress_type;
+ fs_info->compress_level = ctx->compress_level;
+}
- if (btrfs_super_log_root(fs_info->super_copy) != 0) {
- btrfs_warn(fs_info,
- "mount required to replay tree-log, cannot remount read-write");
- ret = -EINVAL;
- goto restore;
- }
+static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ ctx->max_inline = fs_info->max_inline;
+ ctx->commit_interval = fs_info->commit_interval;
+ ctx->metadata_ratio = fs_info->metadata_ratio;
+ ctx->thread_pool_size = fs_info->thread_pool_size;
+ ctx->mount_opt = fs_info->mount_opt;
+ ctx->compress_type = fs_info->compress_type;
+ ctx->compress_level = fs_info->compress_level;
+}
- /*
- * NOTE: when remounting with a change that does writes, don't
- * put it anywhere above this point, as we are not sure to be
- * safe to write until we pass the above checks.
- */
- ret = btrfs_start_pre_rw_mount(fs_info);
- if (ret)
- goto restore;
+#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old)
+{
+ btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
+ btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
+ btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
+ btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log");
+ btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time");
+ btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit");
+ btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard");
+ btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard");
+ btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree");
+ btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching");
+ btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache");
+ btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag");
+ btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data");
+ btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata");
+ btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification");
+ btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
+ btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
+ btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+ btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
+ btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
+
+ btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
+ btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
+ btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
+ btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
+ btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
+ btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag");
+ btrfs_info_if_unset(info, old, COMPRESS, "use no compression");
+
+ /* Did the compression settings change? */
+ if (btrfs_test_opt(info, COMPRESS) &&
+ (!old ||
+ old->compress_type != info->compress_type ||
+ old->compress_level != info->compress_level ||
+ (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) &&
+ btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) {
+ const char *compress_type = btrfs_compress_type2str(info->compress_type);
+
+ btrfs_info(info, "%s %s compression, level %d",
+ btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use",
+ compress_type, info->compress_level);
+ }
+
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+ btrfs_info(info, "max_inline set to %llu", info->max_inline);
+}
- btrfs_clear_sb_rdonly(sb);
+static int btrfs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_context old_ctx;
+ int ret = 0;
+ bool mount_reconfigure = (fc->s_fs_info != NULL);
- set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+ btrfs_info_to_ctx(fs_info, &old_ctx);
- /*
- * If we've gone from readonly -> read/write, we need to get
- * our sync/async discard lists in the right state.
- */
- btrfs_discard_resume(fs_info);
+ /*
+ * This is our "bind mount" trick, we don't want to allow the user to do
+ * anything other than mount a different ro/rw and a different subvol,
+ * all of the mount options should be maintained.
+ */
+ if (mount_reconfigure)
+ ctx->mount_opt = old_ctx.mount_opt;
+
+ sync_filesystem(sb);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
+ return -EINVAL;
+
+ ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
+ if (ret < 0)
+ return ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags);
+ btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size,
+ old_ctx.thread_pool_size);
+
+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from RO to RW");
+ /* Make sure free space cache options match the state on disk. */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
}
-out:
+
+ ret = 0;
+ if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_ro(fs_info);
+ else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_rw(fs_info);
+ if (ret)
+ goto restore;
+
/*
- * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
- * since the absence of the flag means it can be toggled off by remount.
+ * If we set the mask during the parameter parsing VFS would reject the
+ * remount. Here we can set the mask and the value will be updated
+ * appropriately.
*/
- *flags |= SB_I_VERSION;
+ if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL))
+ fc->sb_flags_mask |= SB_POSIXACL;
+ btrfs_emit_options(fs_info, &old_ctx);
wake_up_process(fs_info->transaction_kthread);
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
btrfs_clear_oneshot_options(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return 0;
-
restore:
- /* We've hit an error - don't reset SB_RDONLY */
- if (sb_rdonly(sb))
- old_flags |= SB_RDONLY;
- if (!(old_flags & SB_RDONLY))
- clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
- sb->s_flags = old_flags;
- fs_info->mount_opt = old_opts;
- fs_info->compress_type = old_compress_type;
- fs_info->max_inline = old_max_inline;
- btrfs_resize_thread_pool(fs_info,
- old_thread_pool_size, fs_info->thread_pool_size);
- fs_info->metadata_ratio = old_metadata_ratio;
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_ctx_to_info(fs_info, &old_ctx);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-
return ret;
}
@@ -2133,14 +1815,288 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^=
- BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
- buf->f_fsid.val[1] ^=
- BTRFS_I(d_inode(dentry))->root->root_key.objectid;
+ buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32;
+ buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root);
+
+ return 0;
+}
+
+static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct btrfs_fs_info *p = fc->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_get_tree_super(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ struct block_device *bdev;
+ struct btrfs_device *device;
+ struct super_block *sb;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ int ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ mutex_lock(&uuid_mutex);
+
+ /*
+ * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+ * either a valid device or an error.
+ */
+ device = btrfs_scan_one_device(fc->source, mode, true);
+ ASSERT(device != NULL);
+ if (IS_ERR(device)) {
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(device);
+ }
+
+ fs_devices = device->fs_devices;
+ fs_info->fs_devices = fs_devices;
+
+ ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
+ mutex_unlock(&uuid_mutex);
+ if (ret)
+ return ret;
+
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ ret = -EACCES;
+ goto error;
+ }
+
+ bdev = fs_devices->latest_dev->bdev;
+
+ /*
+ * From now on the error handling is not straightforward.
+ *
+ * If successful, this will transfer the fs_info into the super block,
+ * and fc->s_fs_info will be NULL. However if there's an existing
+ * super, we'll still have fc->s_fs_info populated. If we error
+ * completely out it'll be cleaned up when we drop the fs_context,
+ * otherwise it's tied to the lifetime of the super_block.
+ */
+ sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ goto error;
+ }
+
+ set_device_specific_options(fs_info);
+
+ if (sb->s_root) {
+ btrfs_close_devices(fs_devices);
+ /*
+ * At this stage we may have RO flag mismatch between
+ * fc->sb_flags and sb->s_flags. Caller should detect such
+ * mismatch and reconfigure with sb->s_umount rwsem held if
+ * needed.
+ */
+ } else {
+ snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
+ btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
+ ret = btrfs_fill_super(sb, fs_devices);
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+ }
+
+ btrfs_clear_oneshot_options(fs_info);
+
+ fc->root = dget(sb->s_root);
+ return 0;
+
+error:
+ btrfs_close_devices(fs_devices);
+ return ret;
+}
+
+/*
+ * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes
+ * with different ro/rw options") the following works:
+ *
+ * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo
+ * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar
+ *
+ * which looks nice and innocent but is actually pretty intricate and deserves
+ * a long comment.
+ *
+ * On another filesystem a subvolume mount is close to something like:
+ *
+ * (iii) # create rw superblock + initial mount
+ * mount -t xfs /dev/sdb /opt/
+ *
+ * # create ro bind mount
+ * mount --bind -o ro /opt/foo /mnt/foo
+ *
+ * # unmount initial mount
+ * umount /opt
+ *
+ * Of course, there's some special subvolume sauce and there's the fact that the
+ * sb->s_root dentry is really swapped after mount_subtree(). But conceptually
+ * it's very close and will help us understand the issue.
+ *
+ * The old mount API didn't cleanly distinguish between a mount being made ro
+ * and a superblock being made ro. The only way to change the ro state of
+ * either object was by passing ms_rdonly. If a new mount was created via
+ * mount(2) such as:
+ *
+ * mount("/dev/sdb", "/mnt", "xfs", ms_rdonly, null);
+ *
+ * the MS_RDONLY flag being specified had two effects:
+ *
+ * (1) MNT_READONLY was raised -> the resulting mount got
+ * @mnt->mnt_flags |= MNT_READONLY raised.
+ *
+ * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems
+ * made the superblock ro. Note, how SB_RDONLY has the same value as
+ * ms_rdonly and is raised whenever MS_RDONLY is passed through mount(2).
+ *
+ * Creating a subtree mount via (iii) ends up leaving a rw superblock with a
+ * subtree mounted ro.
+ *
+ * But consider the effect on the old mount API on btrfs subvolume mounting
+ * which combines the distinct step in (iii) into a single step.
+ *
+ * By issuing (i) both the mount and the superblock are turned ro. Now when (ii)
+ * is issued the superblock is ro and thus even if the mount created for (ii) is
+ * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro
+ * to rw for (ii) which it did using an internal remount call.
+ *
+ * IOW, subvolume mounting was inherently complicated due to the ambiguity of
+ * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate
+ * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when
+ * passed by mount(8) to mount(2).
+ *
+ * Enter the new mount API. The new mount API disambiguates making a mount ro
+ * and making a superblock ro.
+ *
+ * (3) To turn a mount ro the MOUNT_ATTR_ONLY flag can be used with either
+ * fsmount() or mount_setattr() this is a pure VFS level change for a
+ * specific mount or mount tree that is never seen by the filesystem itself.
+ *
+ * (4) To turn a superblock ro the "ro" flag must be used with
+ * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
+ * in fc->sb_flags.
+ *
+ * But, currently the util-linux mount command already utilizes the new mount
+ * API and is still setting fsconfig(FSCONFIG_SET_FLAG, "ro") no matter if it's
+ * btrfs or not, setting the whole super block RO. To make per-subvolume mounting
+ * work with different options work we need to keep backward compatibility.
+ */
+static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt)
+{
+ int ret = 0;
+
+ if (fc->sb_flags & SB_RDONLY)
+ return ret;
+
+ down_write(&mnt->mnt_sb->s_umount);
+ if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY))
+ ret = btrfs_reconfigure(fc);
+ up_write(&mnt->mnt_sb->s_umount);
+ return ret;
+}
+
+static int btrfs_get_tree_subvol(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_context *dup_fc;
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+ int ret = 0;
+
+ /*
+ * Setup a dummy root and fs_info for test/set super. This is because
+ * we don't actually fill this stuff out until open_ctree, but we need
+ * then open_ctree will properly initialize the file system specific
+ * settings later. btrfs_init_fs_info initializes the static elements
+ * of the fs_info (locks and such) to make cleanup easier if we find a
+ * superblock with our given fs_devices later on at sget() time.
+ */
+ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+ if (!fs_info)
+ return -ENOMEM;
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ btrfs_free_fs_info(fs_info);
+ return -ENOMEM;
+ }
+ btrfs_init_fs_info(fs_info);
+
+ dup_fc = vfs_dup_fs_context(fc);
+ if (IS_ERR(dup_fc)) {
+ btrfs_free_fs_info(fs_info);
+ return PTR_ERR(dup_fc);
+ }
+
+ /*
+ * When we do the sget_fc this gets transferred to the sb, so we only
+ * need to set it on the dup_fc as this is what creates the super block.
+ */
+ dup_fc->s_fs_info = fs_info;
+
+ /*
+ * We'll do the security settings in our btrfs_get_tree_super() mount
+ * loop, they were duplicated into dup_fc, we can drop the originals
+ * here.
+ */
+ security_free_mnt_opts(&fc->security);
+ fc->security = NULL;
+ mnt = fc_mount(dup_fc);
+ if (IS_ERR(mnt)) {
+ put_fs_context(dup_fc);
+ return PTR_ERR(mnt);
+ }
+ ret = btrfs_reconfigure_for_mount(dup_fc, mnt);
+ put_fs_context(dup_fc);
+ if (ret) {
+ mntput(mnt);
+ return ret;
+ }
+
+ /*
+ * This free's ->subvol_name, because if it isn't set we have to
+ * allocate a buffer to hold the subvol_name, so we just drop our
+ * reference to it here.
+ */
+ dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt);
+ ctx->subvol_name = NULL;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ fc->root = dentry;
return 0;
}
+static int btrfs_get_tree(struct fs_context *fc)
+{
+ /*
+ * Since we use mount_subtree to mount the default/specified subvol, we
+ * have to do mounts in two steps.
+ *
+ * First pass through we call btrfs_get_tree_subvol(), this is just a
+ * wrapper around fc_mount() to call back into here again, and this time
+ * we'll call btrfs_get_tree_super(). This will do the open_ctree() and
+ * everything to open the devices and file system. Then we return back
+ * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
+ * from there we can do our mount_subvol() call, which will lookup
+ * whichever subvol we're mounting and setup this fc with the
+ * appropriate dentry for the subvol.
+ */
+ if (fc->s_fs_info)
+ return btrfs_get_tree_super(fc);
+ return btrfs_get_tree_subvol(fc);
+}
+
static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -2148,22 +2104,85 @@ static void btrfs_kill_super(struct super_block *sb)
btrfs_free_fs_info(fs_info);
}
-static struct file_system_type btrfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
-};
+static void btrfs_free_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+
+ if (fs_info)
+ btrfs_free_fs_info(fs_info);
+
+ if (ctx && refcount_dec_and_test(&ctx->refs)) {
+ kfree(ctx->subvol_name);
+ kfree(ctx);
+ }
+}
+
+static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct btrfs_fs_context *ctx = src_fc->fs_private;
-static struct file_system_type btrfs_root_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount_root,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ /*
+ * Give a ref to our ctx to this dup, as we want to keep it around for
+ * our original fc so we can have the subvolume name or objectid.
+ *
+ * We unset ->source in the original fc because the dup needs it for
+ * mounting, and then once we free the dup it'll free ->source, so we
+ * need to make sure we're only pointing to it in one fc.
+ */
+ refcount_inc(&ctx->refs);
+ fc->fs_private = ctx;
+ fc->source = src_fc->source;
+ src_fc->source = NULL;
+ return 0;
+}
+
+static const struct fs_context_operations btrfs_fs_context_ops = {
+ .parse_param = btrfs_parse_param,
+ .reconfigure = btrfs_reconfigure,
+ .get_tree = btrfs_get_tree,
+ .dup = btrfs_dup_fs_context,
+ .free = btrfs_free_fs_context,
};
+static int btrfs_init_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ refcount_set(&ctx->refs, 1);
+ fc->fs_private = ctx;
+ fc->ops = &btrfs_fs_context_ops;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx);
+ } else {
+ ctx->thread_pool_size =
+ min_t(unsigned long, num_online_cpus() + 2, 8);
+ ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ }
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#endif
+ fc->sb_flags |= SB_I_VERSION;
+
+ return 0;
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .init_fs_context = btrfs_init_fs_context,
+ .parameters = btrfs_fs_parameters,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ };
+
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
@@ -2194,12 +2213,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
- vol->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol);
+ if (ret < 0)
+ goto out;
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
+ /*
+ * Scanning outside of mount can return NULL which would turn
+ * into 0 error code.
+ */
+ device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
@@ -2213,8 +2238,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
- if (IS_ERR(device)) {
+ /*
+ * Scanning outside of mount can return NULL which would turn
+ * into 0 error code.
+ */
+ device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
ret = PTR_ERR(device);
break;
@@ -2228,15 +2257,14 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
break;
}
+out:
kfree(vol);
return ret;
}
static int btrfs_freeze(struct super_block *sb)
{
- struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *root = fs_info->tree_root;
set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
/*
@@ -2245,20 +2273,14 @@ static int btrfs_freeze(struct super_block *sb)
* we want to avoid on a frozen filesystem), or do the commit
* ourselves.
*/
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- /* no transaction, don't bother */
- if (PTR_ERR(trans) == -ENOENT)
- return 0;
- return PTR_ERR(trans);
- }
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(fs_info->tree_root);
}
static int check_dev_super(struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_super_block *sb;
+ u64 last_trans;
u16 csum_type;
int ret = 0;
@@ -2294,10 +2316,10 @@ static int check_dev_super(struct btrfs_device *dev)
if (ret < 0)
goto out;
- if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ last_trans = btrfs_get_last_trans_committed(fs_info);
+ if (btrfs_super_generation(sb) != last_trans) {
btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
- btrfs_super_generation(sb),
- fs_info->last_trans_committed);
+ btrfs_super_generation(sb), last_trans);
ret = -EUCLEAN;
goto out;
}
@@ -2355,6 +2377,33 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
return 0;
}
+static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_count(fs_info, nr);
+
+ /*
+ * Only report the real number for DEBUG builds, as there are reports of
+ * serious performance degradation caused by too frequent shrinks.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ return nr;
+ return 0;
+}
+
+static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
+{
+ const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ btrfs_free_extent_maps(fs_info, nr_to_scan);
+
+ /* The extent map shrinker runs asynchronously, so always return 0. */
+ return 0;
+}
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
@@ -2366,9 +2415,10 @@ static const struct super_operations btrfs_super_ops = {
.destroy_inode = btrfs_destroy_inode,
.free_inode = btrfs_free_inode,
.statfs = btrfs_statfs,
- .remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
.unfreeze_fs = btrfs_unfreeze,
+ .nr_cached_objects = btrfs_nr_cached_objects,
+ .free_cached_objects = btrfs_free_cached_objects,
};
static const struct file_operations btrfs_ctl_fops = {
@@ -2407,9 +2457,6 @@ static int __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- ", integrity-checker=on"
-#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
@@ -2459,6 +2506,9 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_init_cachep,
.exit_func = btrfs_destroy_cachep,
}, {
+ .init_func = btrfs_init_dio,
+ .exit_func = btrfs_destroy_dio,
+ }, {
.init_func = btrfs_transaction_init,
.exit_func = btrfs_transaction_exit,
}, {
@@ -2550,6 +2600,7 @@ static int __init init_btrfs_fs(void)
late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
+MODULE_DESCRIPTION("B-Tree File System (BTRFS)");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: xxhash64");
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index 8dbb909b364f..d80a86acfbbe 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -3,11 +3,20 @@
#ifndef BTRFS_SUPER_H
#define BTRFS_SUPER_H
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags);
+#include <linux/types.h>
+#include <linux/fs.h>
+#include "fs.h"
+
+struct super_block;
+struct btrfs_fs_info;
+
+bool btrfs_check_options(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt,
+ unsigned long flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid);
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info);
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c9198723e4cb..5912d5057766 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
#ifdef CONFIG_BTRFS_DEBUG
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
+/* Remove once support for raid stripe tree is feature complete. */
+BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
BTRFS_FEAT_ATTR_PTR(block_group_tree),
+ BTRFS_FEAT_ATTR_PTR(simple_quota),
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
+ BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
@@ -380,6 +385,8 @@ static const char *rescue_opts[] = {
"nologreplay",
"ignorebadroots",
"ignoredatacsums",
+ "ignoremetacsums",
+ "ignoresuperflags",
"all",
};
@@ -416,10 +423,17 @@ BTRFS_ATTR(static_feature, supported_sectorsizes,
static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf)
{
- return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
+ return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
}
BTRFS_ATTR(static_feature, acl, acl_show);
+static ssize_t temp_fsid_supported_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ return sysfs_emit(buf, "0\n");
+}
+BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show);
+
/*
* Features which only depend on kernel version.
*
@@ -433,6 +447,7 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, send_stream_version),
BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
+ BTRFS_ATTR_PTR(static_feature, temp_fsid),
NULL
};
@@ -881,6 +896,9 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+SPACE_INFO_ATTR(reclaim_count);
+SPACE_INFO_ATTR(reclaim_bytes);
+SPACE_INFO_ATTR(reclaim_errors);
BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
@@ -889,8 +907,12 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
char *buf)
{
struct btrfs_space_info *space_info = to_space_info(kobj);
+ ssize_t ret;
- return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+ spin_lock(&space_info->lock);
+ ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info));
+ spin_unlock(&space_info->lock);
+ return ret;
}
static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -901,6 +923,9 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
int thresh;
int ret;
+ if (READ_ONCE(space_info->dynamic_reclaim))
+ return -EINVAL;
+
ret = kstrtoint(buf, 10, &thresh);
if (ret)
return ret;
@@ -917,6 +942,72 @@ BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
btrfs_sinfo_bg_reclaim_threshold_show,
btrfs_sinfo_bg_reclaim_threshold_store);
+static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim));
+}
+
+static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int dynamic_reclaim;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &dynamic_reclaim);
+ if (ret)
+ return ret;
+
+ if (dynamic_reclaim < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, dynamic_reclaim,
+ btrfs_sinfo_dynamic_reclaim_show,
+ btrfs_sinfo_dynamic_reclaim_store);
+
+static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim));
+}
+
+static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int periodic_reclaim;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &periodic_reclaim);
+ if (ret)
+ return ret;
+
+ if (periodic_reclaim < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, periodic_reclaim,
+ btrfs_sinfo_periodic_reclaim_show,
+ btrfs_sinfo_periodic_reclaim_store);
+
/*
* Allocation information about block group types.
*
@@ -934,8 +1025,13 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(space_info, dynamic_reclaim),
BTRFS_ATTR_PTR(space_info, chunk_size),
BTRFS_ATTR_PTR(space_info, size_classes),
+ BTRFS_ATTR_PTR(space_info, reclaim_count),
+ BTRFS_ATTR_PTR(space_info, reclaim_bytes),
+ BTRFS_ATTR_PTR(space_info, reclaim_errors),
+ BTRFS_ATTR_PTR(space_info, periodic_reclaim),
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
#endif
@@ -1022,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -1032,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -1084,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -1196,21 +1292,31 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info));
}
BTRFS_ATTR(, generation, btrfs_generation_show);
+static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid);
+}
+BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
+
static const char * const btrfs_read_policy_name[] = { "pid" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy);
ssize_t ret = 0;
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (fs_devices->read_policy == i)
+ if (policy == i)
ret += sysfs_emit_at(buf, ret, "%s[%s]",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
@@ -1234,8 +1340,8 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
- if (i != fs_devices->read_policy) {
- fs_devices->read_policy = i;
+ if (i != READ_ONCE(fs_devices->read_policy)) {
+ WRITE_ONCE(fs_devices->read_policy, i);
btrfs_info(fs_devices->fs_info,
"read policy set to '%s'",
btrfs_read_policy_name[i]);
@@ -1284,6 +1390,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
+#ifdef CONFIG_BTRFS_DEBUG
+static ssize_t btrfs_offload_csum_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+
+ switch (READ_ONCE(fs_devices->offload_csum_mode)) {
+ case BTRFS_OFFLOAD_CSUM_AUTO:
+ return sysfs_emit(buf, "auto\n");
+ case BTRFS_OFFLOAD_CSUM_FORCE_ON:
+ return sysfs_emit(buf, "1\n");
+ case BTRFS_OFFLOAD_CSUM_FORCE_OFF:
+ return sysfs_emit(buf, "0\n");
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+}
+
+static ssize_t btrfs_offload_csum_store(struct kobject *kobj,
+ struct kobj_attribute *a, const char *buf,
+ size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ int ret;
+ bool val;
+
+ ret = kstrtobool(buf, &val);
+ if (ret == 0)
+ WRITE_ONCE(fs_devices->offload_csum_mode,
+ val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF);
+ else if (ret == -EINVAL && sysfs_streq(buf, "auto"))
+ WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO);
+ else
+ return -EINVAL;
+
+ return len;
+}
+BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store);
+#endif
+
/*
* Per-filesystem information and stats.
*
@@ -1302,6 +1449,10 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, read_policy),
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
+ BTRFS_ATTR_PTR(, temp_fsid),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_ATTR_PTR(, offload_csum),
+#endif
NULL,
};
@@ -2090,6 +2241,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
}
BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ ssize_t ret = 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ ASSERT(btrfs_qgroup_enabled(fs_info));
+ switch (btrfs_qgroup_mode(fs_info)) {
+ case BTRFS_QGROUP_MODE_FULL:
+ ret = sysfs_emit(buf, "qgroup\n");
+ break;
+ case BTRFS_QGROUP_MODE_SIMPLE:
+ ret = sysfs_emit(buf, "squota\n");
+ break;
+ default:
+ btrfs_warn(fs_info, "unexpected qgroup mode %d\n",
+ btrfs_qgroup_mode(fs_info));
+ break;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return ret;
+}
+BTRFS_ATTR(qgroups, mode, qgroup_mode_show);
+
static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
struct kobj_attribute *a,
char *buf)
@@ -2152,6 +2330,7 @@ static struct attribute *qgroups_attrs[] = {
BTRFS_ATTR_PTR(qgroups, enabled),
BTRFS_ATTR_PTR(qgroups, inconsistent),
BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ BTRFS_ATTR_PTR(qgroups, mode),
NULL
};
ATTRIBUTE_GROUPS(qgroups);
@@ -2243,7 +2422,7 @@ int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
struct kobject *qgroups_kobj = fs_info->qgroups_kobj;
int ret;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return 0;
if (qgroup->kobj.state_initialized)
return 0;
@@ -2264,7 +2443,7 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup *next;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return;
rbtree_postorder_for_each_entry_safe(qgroup, next,
@@ -2285,7 +2464,7 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup *next;
int ret = 0;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return 0;
ASSERT(fsid_kobj);
@@ -2317,7 +2496,7 @@ out:
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup)
{
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return;
if (qgroup->kobj.state_initialized) {
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 86c7eef12873..e6a284c59809 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -3,8 +3,17 @@
#ifndef BTRFS_SYSFS_H
#define BTRFS_SYSFS_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
#include <linux/kobject.h>
+struct btrfs_fs_info;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_block_group;
+struct btrfs_space_info;
+struct btrfs_qgroup;
+
enum btrfs_feature_set {
FEAT_COMPAT,
FEAT_COMPAT_RO,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ca09cf9afce8..ce50847e1e01 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -28,6 +28,7 @@ const char *test_error[] = {
[TEST_ALLOC_INODE] = "cannot allocate inode",
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
+ [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -60,10 +61,7 @@ struct inode *btrfs_new_test_inode(void)
return NULL;
inode->i_mode = S_IFREG;
- inode->i_ino = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.offset = 0;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_FIRST_FREE_OBJECTID);
inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG);
return inode;
@@ -102,7 +100,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0);
+ extent_io_tree_init(fs_info, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -159,8 +157,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
if (!fs_info)
return;
- if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
- &fs_info->fs_state)))
+ if (WARN_ON(!btrfs_is_testing(fs_info)))
return;
test_mnt->mnt_sb->s_fs_info = NULL;
@@ -185,7 +182,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->buffer_lock);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
dev_list) {
btrfs_free_dummy_device(dev);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 7a2d7ffbe30e..dc2f2ab15fa5 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,6 +23,7 @@ enum {
TEST_ALLOC_INODE,
TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP,
+ TEST_ALLOC_CHUNK_MAP,
};
extern const char *test_error[];
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index f4fd3fb7c887..de226209220f 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -14,9 +14,9 @@
#include "../disk-io.h"
#include "../btrfs_inode.h"
-#define PROCESS_UNLOCK (1 << 0)
-#define PROCESS_RELEASE (1 << 1)
-#define PROCESS_TEST_LOCKED (1 << 2)
+#define PROCESS_UNLOCK (1U << 0)
+#define PROCESS_RELEASE (1U << 1)
+#define PROCESS_TEST_LOCKED (1U << 2)
static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
@@ -180,7 +180,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
start = 0;
end = start + PAGE_SIZE - 1;
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (!found) {
test_err("should have found at least one delalloc");
@@ -211,7 +211,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (!found) {
test_err("couldn't find delalloc in our range");
@@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
}
start = test_start;
end = start + PAGE_SIZE - 1;
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
@@ -266,7 +266,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (!found) {
test_err("didn't find our range");
@@ -307,7 +307,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (!found) {
test_err("didn't find our range");
@@ -672,7 +672,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < eb->len; i++) {
- struct page *page = eb->pages[i >> PAGE_SHIFT];
+ struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0);
void *addr = page_address(page) + offset_in_page(i);
if (memcmp(addr, memory + i, 1) != 0) {
@@ -688,7 +688,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) {
- void *eb_addr = page_address(eb->pages[i]);
+ void *eb_addr = folio_address(eb->folios[i]);
if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) {
dump_eb_and_memory_contents(eb, memory, test_name);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index bf85c75ee722..609bb6c9c087 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -11,23 +11,27 @@
#include "../disk-io.h"
#include "../block-group.h"
-static void free_extent_map_tree(struct extent_map_tree *em_tree)
+static int free_extent_map_tree(struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
struct rb_node *node;
+ int ret = 0;
write_lock(&em_tree->lock);
- while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
- node = rb_first_cached(&em_tree->map);
+ while (!RB_EMPTY_ROOT(&em_tree->root)) {
+ node = rb_first(&em_tree->root);
em = rb_entry(node, struct extent_map, rb_node);
- remove_extent_mapping(em_tree, em);
+ remove_extent_mapping(inode, em);
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
+ ret = -EINVAL;
test_err(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
- em->start, em->len, em->block_start,
- em->block_len, refcount_read(&em->refs));
+"em leak: em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu offset %llu) refs %d",
+ em->start, em->len, em->disk_bytenr,
+ em->disk_num_bytes, em->offset,
+ refcount_read(&em->refs));
refcount_set(&em->refs, 1);
}
@@ -35,6 +39,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
free_extent_map(em);
}
write_unlock(&em_tree->lock);
+
+ return ret;
}
/*
@@ -53,13 +59,14 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
* ->add_extent_mapping(0, 16K)
* -> #handle -EEXIST
*/
-static int test_case_1(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 start = 0;
u64 len = SZ_8K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -70,10 +77,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
/* Add [0, 16K) */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_16K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 16K)");
@@ -91,10 +99,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->start = SZ_16K;
em->len = SZ_4K;
- em->block_start = SZ_32K; /* avoid merging */
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_32K; /* avoid merging */
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [16K, 20K)");
@@ -112,27 +121,35 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
/* Add [0, 8K), should return [0, 16K) instead. */
em->start = start;
em->len = len;
- em->block_start = start;
- em->block_len = len;
+ em->disk_bytenr = start;
+ em->disk_num_bytes = len;
+ em->ram_bytes = len;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_16K ||
- em->block_start != 0 || em->block_len != SZ_16K)) {
+ if (!em) {
+ test_err("case1 [%llu %llu]: no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) {
test_err(
-"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu",
start, start + len, ret, em->start, em->len,
- em->block_start, em->block_len);
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -143,11 +160,12 @@ out:
* Reading the inline ending up with EEXIST, ie. read an inline
* extent and discard page cache and read it again.
*/
-static int test_case_2(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -158,10 +176,11 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
/* Add [0, 1K) */
em->start = 0;
em->len = SZ_1K;
- em->block_start = EXTENT_MAP_INLINE;
- em->block_len = (u64)-1;
+ em->disk_bytenr = EXTENT_MAP_INLINE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = SZ_1K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 1K)");
@@ -179,10 +198,11 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_4K;
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_4K;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -200,37 +220,45 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
/* Add [0, 1K) */
em->start = 0;
em->len = SZ_1K;
- em->block_start = EXTENT_MAP_INLINE;
- em->block_len = (u64)-1;
+ em->disk_bytenr = EXTENT_MAP_INLINE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = SZ_1K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case2 [0 1K]: ret %d", ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_1K ||
- em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
+ if (!em) {
+ test_err("case2 [0 1K]: no extent map returned");
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ em->disk_bytenr != EXTENT_MAP_INLINE) {
test_err(
-"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
- ret, em->start, em->len, em->block_start,
- em->block_len);
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu",
+ ret, em->start, em->len, em->disk_bytenr);
ret = -EINVAL;
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
static int __test_case_3(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree, u64 start)
+ struct btrfs_inode *inode, u64 start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 len = SZ_4K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -241,10 +269,11 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
/* Add [4K, 8K) */
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_4K;
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_4K;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -262,32 +291,40 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
/* Add [0, 16K) */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_16K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case3 [0x%llx 0x%llx): ret %d",
+ test_err("case3 [%llu %llu): ret %d",
start, start + len, ret);
goto out;
}
+ if (!em) {
+ test_err("case3 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
/*
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
- if (em &&
- (start < em->start || start + len > extent_map_end(em) ||
- em->start != em->block_start || em->len != em->block_len)) {
+ if (start < em->start || start + len > extent_map_end(em) ||
+ em->start != extent_map_block_start(em)) {
test_err(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
- em->block_start, em->block_len);
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -308,28 +345,29 @@ out:
* -> add_extent_mapping()
* -> add_extent_mapping()
*/
-static int test_case_3(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_3(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
int ret;
- ret = __test_case_3(fs_info, em_tree, 0);
+ ret = __test_case_3(fs_info, inode, 0);
if (ret)
return ret;
- ret = __test_case_3(fs_info, em_tree, SZ_8K);
+ ret = __test_case_3(fs_info, inode, SZ_8K);
if (ret)
return ret;
- ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K));
+ ret = __test_case_3(fs_info, inode, (12 * SZ_1K));
return ret;
}
static int __test_case_4(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree, u64 start)
+ struct btrfs_inode *inode, u64 start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 len = SZ_4K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -340,10 +378,11 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [0K, 8K) */
em->start = 0;
em->len = SZ_8K;
- em->block_start = 0;
- em->block_len = SZ_8K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_8K;
+ em->ram_bytes = SZ_8K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 8K)");
@@ -361,10 +400,11 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [8K, 32K) */
em->start = SZ_8K;
em->len = 24 * SZ_1K;
- em->block_start = SZ_16K; /* avoid merging */
- em->block_len = 24 * SZ_1K;
+ em->disk_bytenr = SZ_16K; /* avoid merging */
+ em->disk_num_bytes = 24 * SZ_1K;
+ em->ram_bytes = 24 * SZ_1K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [8K, 32K)");
@@ -381,26 +421,35 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [0K, 32K) */
em->start = 0;
em->len = SZ_32K;
- em->block_start = 0;
- em->block_len = SZ_32K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_32K;
+ em->ram_bytes = SZ_32K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case4 [0x%llx 0x%llx): ret %d",
- start, len, ret);
+ test_err("case4 [%llu %llu): ret %d",
+ start, start + len, ret);
+ goto out;
+ }
+ if (!em) {
+ test_err("case4 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
goto out;
}
- if (em && (start < em->start || start + len > extent_map_end(em))) {
+ if (start < em->start || start + len > extent_map_end(em)) {
test_err(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
- start, len, ret, em->start, em->len, em->block_start,
- em->block_len);
+"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)",
+ start, start + len, ret, em->start, em->len,
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -430,22 +479,22 @@ out:
* # handle -EEXIST when adding
* # [0, 32K)
*/
-static int test_case_4(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_4(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
int ret;
- ret = __test_case_4(fs_info, em_tree, 0);
+ ret = __test_case_4(fs_info, inode, 0);
if (ret)
return ret;
- ret = __test_case_4(fs_info, em_tree, SZ_4K);
+ ret = __test_case_4(fs_info, inode, SZ_4K);
return ret;
}
-static int add_compressed_extent(struct extent_map_tree *em_tree,
+static int add_compressed_extent(struct btrfs_inode *inode,
u64 start, u64 len, u64 block_start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
@@ -457,11 +506,12 @@ static int add_compressed_extent(struct extent_map_tree *em_tree,
em->start = start;
em->len = len;
- em->block_start = block_start;
- em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->disk_bytenr = block_start;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = len;
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
free_extent_map(em);
if (ret < 0) {
@@ -513,7 +563,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
struct rb_node *n;
int i;
- for (i = 0, n = rb_first_cached(&em_tree->map);
+ for (i = 0, n = rb_first(&em_tree->root);
valid_ranges[index][i].len && n;
i++, n = rb_next(n)) {
struct extent_map *entry = rb_entry(n, struct extent_map, rb_node);
@@ -567,53 +617,44 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
* They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from
* merging the em's.
*/
-static int test_case_5(void)
+static int test_case_5(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
- struct extent_map_tree *em_tree;
- struct inode *inode;
u64 start, end;
int ret;
+ int ret2;
test_msg("Running btrfs_drop_extent_map_range tests");
- inode = btrfs_new_test_inode();
- if (!inode) {
- test_std_err(TEST_ALLOC_INODE);
- return -ENOMEM;
- }
-
- em_tree = &BTRFS_I(inode)->extent_tree;
-
/* [0, 12k) */
- ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0);
+ ret = add_compressed_extent(inode, 0, SZ_4K * 3, 0);
if (ret) {
test_err("cannot add extent range [0, 12K)");
goto out;
}
/* [12k, 24k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+ ret = add_compressed_extent(inode, SZ_4K * 3, SZ_4K * 3, SZ_4K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [24k, 36k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+ ret = add_compressed_extent(inode, SZ_4K * 6, SZ_4K * 3, SZ_8K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [36k, 40k) */
- ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+ ret = add_compressed_extent(inode, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [40k, 64k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+ ret = add_compressed_extent(inode, SZ_4K * 10, SZ_4K * 6, SZ_16K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
@@ -622,36 +663,39 @@ static int test_case_5(void)
/* Drop [8k, 12k) */
start = SZ_8K;
end = (3 * SZ_4K) - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 0);
if (ret)
goto out;
/* Drop [12k, 20k) */
start = SZ_4K * 3;
end = SZ_16K + SZ_4K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 1);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 1);
if (ret)
goto out;
/* Drop [28k, 32k) */
start = SZ_32K - SZ_4K;
end = SZ_32K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 2);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 2);
if (ret)
goto out;
/* Drop [32k, 64k) */
start = SZ_32K;
end = SZ_64K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 3);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 3);
if (ret)
goto out;
out:
- iput(inode);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -660,31 +704,35 @@ out:
* for areas between two existing ems. Validate it doesn't do this when there
* are two unmerged em's side by side.
*/
-static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree)
+static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em = NULL;
int ret;
+ int ret2;
- ret = add_compressed_extent(em_tree, 0, SZ_4K, 0);
+ ret = add_compressed_extent(inode, 0, SZ_4K, 0);
if (ret)
goto out;
- ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0);
+ ret = add_compressed_extent(inode, SZ_4K, SZ_4K, 0);
if (ret)
goto out;
em = alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_16K;
- em->block_len = SZ_16K;
+ em->disk_bytenr = SZ_16K;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K);
+ ret = btrfs_add_extent_mapping(inode, &em, 0, SZ_8K);
write_unlock(&em_tree->lock);
if (ret != 0) {
@@ -704,7 +752,10 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
ret = 0;
out:
free_extent_map(em);
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -713,38 +764,30 @@ out:
* true would mess up the start/end calculations and subsequent splits would be
* incorrect.
*/
-static int test_case_7(void)
+static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
- struct extent_map_tree *em_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
- struct inode *inode;
int ret;
+ int ret2;
test_msg("Running btrfs_drop_extent_cache with pinned");
- inode = btrfs_new_test_inode();
- if (!inode) {
- test_std_err(TEST_ALLOC_INODE);
- return -ENOMEM;
- }
-
- em_tree = &BTRFS_I(inode)->extent_tree;
-
em = alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
/* [0, 16K), pinned */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_16K;
+ em->flags |= (EXTENT_FLAG_PINNED | EXTENT_FLAG_COMPRESS_ZLIB);
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -762,10 +805,11 @@ static int test_case_7(void)
/* [32K, 48K), not pinned */
em->start = SZ_32K;
em->len = SZ_16K;
- em->block_start = SZ_32K;
- em->block_len = SZ_16K;
+ em->disk_bytenr = SZ_32K;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -777,7 +821,7 @@ static int test_case_7(void)
* Drop [0, 36K) This should skip the [0, 4K) extent and then split the
* [32K, 48K) extent.
*/
- btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true);
+ btrfs_drop_extent_map_range(inode, 0, (36 * SZ_1K) - 1, true);
/* Make sure our extent maps look sane. */
ret = -EINVAL;
@@ -826,8 +870,9 @@ static int test_case_7(void)
goto out;
}
- if (em->block_start != SZ_32K + SZ_4K) {
- test_err("em->block_start is %llu, expected 36K", em->block_start);
+ if (extent_map_block_start(em) != SZ_32K + SZ_4K) {
+ test_err("em->block_start is %llu, expected 36K",
+ extent_map_block_start(em));
goto out;
}
@@ -844,7 +889,110 @@ static int test_case_7(void)
ret = 0;
out:
free_extent_map(em);
- iput(inode);
+ /* Unpin our extent to prevent warning when removing it below. */
+ ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0);
+ if (ret == 0)
+ ret = ret2;
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
+ return ret;
+}
+
+/*
+ * Test a regression for compressed extent map adjustment when we attempt to
+ * add an extent map that is partially overlapped by another existing extent
+ * map. The resulting extent map offset was left unchanged despite having
+ * incremented its start offset.
+ */
+static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
+{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ int ret;
+ int ret2;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ /* Compressed extent for the file range [120K, 128K). */
+ em->start = SZ_1K * 120;
+ em->len = SZ_8K;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_8K;
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
+ write_lock(&em_tree->lock);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
+ write_unlock(&em_tree->lock);
+ free_extent_map(em);
+ if (ret < 0) {
+ test_err("couldn't add extent map for range [120K, 128K)");
+ goto out;
+ }
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Compressed extent for the file range [108K, 144K), which overlaps
+ * with the [120K, 128K) we previously inserted.
+ */
+ em->start = SZ_1K * 108;
+ em->len = SZ_1K * 36;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_1K * 36;
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
+
+ /*
+ * Try to add the extent map but with a search range of [140K, 144K),
+ * this should succeed and adjust the extent map to the range
+ * [128K, 144K), with a length of 16K and an offset of 20K.
+ *
+ * This simulates a scenario where in the subvolume tree of an inode we
+ * have a compressed file extent item for the range [108K, 144K) and we
+ * have an overlapping compressed extent map for the range [120K, 128K),
+ * which was created by an encoded write, but its ordered extent was not
+ * yet completed, so the subvolume tree doesn't have yet the file extent
+ * item for that range - we only have the extent map in the inode's
+ * extent map tree.
+ */
+ write_lock(&em_tree->lock);
+ ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K);
+ write_unlock(&em_tree->lock);
+ free_extent_map(em);
+ if (ret < 0) {
+ test_err("couldn't add extent map for range [108K, 144K)");
+ goto out;
+ }
+
+ if (em->start != SZ_128K) {
+ test_err("unexpected extent map start %llu (should be 128K)", em->start);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (em->len != SZ_16K) {
+ test_err("unexpected extent map length %llu (should be 16K)", em->len);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (em->offset != SZ_1K * 20) {
+ test_err("unexpected extent map offset %llu (should be 20K)", em->offset);
+ ret = -EINVAL;
+ goto out;
+ }
+out:
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -864,33 +1012,21 @@ struct rmap_test_vector {
static int test_rmap_block(struct btrfs_fs_info *fs_info,
struct rmap_test_vector *test)
{
- struct extent_map *em;
- struct map_lookup *map = NULL;
+ struct btrfs_chunk_map *map;
u64 *logical = NULL;
int i, out_ndaddrs, out_stripe_len;
int ret;
- em = alloc_extent_map();
- if (!em) {
- test_std_err(TEST_ALLOC_EXTENT_MAP);
- return -ENOMEM;
- }
-
- map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+ map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL);
if (!map) {
- kfree(em);
- test_std_err(TEST_ALLOC_EXTENT_MAP);
+ test_std_err(TEST_ALLOC_CHUNK_MAP);
return -ENOMEM;
}
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
/* Start at 4GiB logical address */
- em->start = SZ_4G;
- em->len = test->data_stripe_size * test->num_data_stripes;
- em->block_len = em->len;
- em->orig_block_len = test->data_stripe_size;
- em->map_lookup = map;
-
+ map->start = SZ_4G;
+ map->chunk_len = test->data_stripe_size * test->num_data_stripes;
+ map->stripe_size = test->data_stripe_size;
map->num_stripes = test->num_stripes;
map->type = test->raid_type;
@@ -906,15 +1042,14 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
map->stripes[i].physical = test->data_stripe_phys_start[i];
}
- write_lock(&fs_info->mapping_tree.lock);
- ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
- write_unlock(&fs_info->mapping_tree.lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
- test_err("error adding block group mapping to mapping tree");
+ test_err("error adding chunk map to mapping tree");
+ btrfs_free_chunk_map(map);
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
@@ -943,14 +1078,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = 0;
out:
- write_lock(&fs_info->mapping_tree.lock);
- remove_extent_mapping(&fs_info->mapping_tree, em);
- write_unlock(&fs_info->mapping_tree.lock);
- /* For us */
- free_extent_map(em);
+ btrfs_remove_chunk_map(fs_info, map);
out_free:
- /* For the tree */
- free_extent_map(em);
kfree(logical);
return ret;
}
@@ -958,7 +1087,8 @@ out_free:
int btrfs_test_extent_map(void)
{
struct btrfs_fs_info *fs_info = NULL;
- struct extent_map_tree *em_tree;
+ struct inode *inode;
+ struct btrfs_root *root = NULL;
int ret = 0, i;
struct rmap_test_vector rmap_tests[] = {
{
@@ -1007,33 +1137,45 @@ int btrfs_test_extent_map(void)
return -ENOMEM;
}
- em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
- if (!em_tree) {
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
ret = -ENOMEM;
goto out;
}
- extent_map_tree_init(em_tree);
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ root = NULL;
+ goto out;
+ }
+
+ BTRFS_I(inode)->root = root;
- ret = test_case_1(fs_info, em_tree);
+ ret = test_case_1(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_2(fs_info, em_tree);
+ ret = test_case_2(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_3(fs_info, em_tree);
+ ret = test_case_3(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_4(fs_info, em_tree);
+ ret = test_case_4(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_5();
+ ret = test_case_5(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_6(fs_info, em_tree);
+ ret = test_case_6(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_7();
+ ret = test_case_7(fs_info, BTRFS_I(inode));
+ if (ret)
+ goto out;
+ ret = test_case_8(fs_info, BTRFS_I(inode));
if (ret)
goto out;
@@ -1045,7 +1187,8 @@ int btrfs_test_extent_map(void)
}
out:
- kfree(em_tree);
+ iput(inode);
+ btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
return ret;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 492d69d2fa73..3ea3bc2225fe 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -117,7 +117,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
/* Now for a regular extent */
insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0,
- disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
+ disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
disk_bytenr += sectorsize;
offset += sectorsize - 1;
@@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
}
-static unsigned long prealloc_only = 0;
-static unsigned long compressed_only = 0;
-static unsigned long vacancy_only = 0;
+static u32 prealloc_only = 0;
+static u32 compressed_only = 0;
+static u32 vacancy_only = 0;
static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
{
@@ -258,14 +258,14 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* First with no extents */
BTRFS_I(inode)->root = root;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize);
if (IS_ERR(em)) {
em = NULL;
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
free_extent_map(em);
@@ -278,13 +278,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
*/
setup_file_extents(root, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_INLINE) {
- test_err("expected an inline, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_INLINE) {
+ test_err("expected an inline, got %llu", em->disk_bytenr);
goto out;
}
@@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
/*
@@ -316,13 +316,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 4) {
@@ -332,20 +332,20 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* Regular extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize - 1) {
@@ -355,25 +355,24 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* The next 3 are split extents */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -383,26 +382,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -412,19 +410,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -434,31 +432,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu",
- orig_start, em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
disk_bytenr += (em->start - orig_start);
- if (em->block_start != disk_bytenr) {
+ if (extent_map_block_start(em) != disk_bytenr) {
test_err("wrong block start, want %llu, have %llu",
- disk_bytenr, em->block_start);
+ disk_bytenr, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* Prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -468,26 +466,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* The next 3 are a half written prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -497,27 +494,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_HOLE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_HOLE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -527,30 +523,29 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("unexpected orig offset, wanted %llu, have %llu",
- orig_start, em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("unexpected offset, wanted %llu, have %llu",
+ em->start - orig_start, em->offset);
goto out;
}
- if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ if (extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + (em->start - em->orig_start),
- em->block_start);
+ disk_bytenr + em->offset, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -560,32 +555,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu", orig_start,
- em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
- if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ if (extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + (em->start - em->orig_start),
- em->block_start);
+ disk_bytenr + em->offset, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* Now for the compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -595,31 +589,30 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* Split compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -629,32 +622,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -664,25 +656,24 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != disk_bytenr) {
+ if (extent_map_block_start(em) != disk_bytenr) {
test_err("block start does not match, want %llu got %llu",
- disk_bytenr, em->block_start);
+ disk_bytenr, extent_map_block_start(em));
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -692,31 +683,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* A hole between regular extents but no hole extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -726,24 +717,23 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole extent, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole extent, got %llu", em->disk_bytenr);
goto out;
}
/*
@@ -758,25 +748,24 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
vacancy_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -786,12 +775,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong orig offset, want 0, have %llu", em->offset);
goto out;
}
ret = 0;
@@ -850,13 +838,13 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
insert_inode_item_key(root);
insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != 0 || em->len != sectorsize) {
@@ -866,19 +854,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
+ test_err("wrong flags, wanted %u, have %u", vacancy_only,
em->flags);
goto out;
}
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != sectorsize) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (extent_map_block_start(em) != sectorsize) {
+ test_err("expected a real extent, got %llu", extent_map_block_start(em));
goto out;
}
if (em->start != sectorsize || em->len != sectorsize) {
@@ -888,7 +876,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, wanted 0 got %lu",
+ test_err("unexpected flags set, wanted 0 got %u",
em->flags);
goto out;
}
@@ -1095,8 +1083,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
test_msg("running inode tests");
- set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
- set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+ compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB;
+ prealloc_only |= EXTENT_FLAG_PREALLOC;
ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0548072c642f..dbef80cd5a9f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,12 +23,10 @@
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
-#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
-#include "defrag.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
@@ -145,8 +143,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(
&transaction->delayed_refs.href_root.rb_root));
- WARN_ON(!RB_EMPTY_ROOT(
- &transaction->delayed_refs.dirty_extent_root));
+ WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
@@ -164,7 +161,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
cache = list_first_entry(&transaction->deleted_bgs,
struct btrfs_block_group,
bg_list);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the deleted_bgs list during a transaction abort.
+ */
+ spin_lock(&transaction->fs_info->unused_bgs_lock);
list_del_init(&cache->bg_list);
+ spin_unlock(&transaction->fs_info->unused_bgs_lock);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
}
@@ -278,8 +281,10 @@ loop:
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (TRANS_ABORTED(cur_trans)) {
+ const int abort_error = cur_trans->aborted;
+
spin_unlock(&fs_info->trans_lock);
- return cur_trans->aborted;
+ return abort_error;
}
if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
@@ -353,7 +358,7 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
- cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
+ xa_init(&cur_trans->delayed_refs.dirty_extents);
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
/*
@@ -384,7 +389,7 @@ loop:
IO_TREE_TRANS_DIRTY_PAGES);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS);
- fs_info->generation++;
+ btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
@@ -407,7 +412,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int ret = 0;
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- root->last_trans < trans->transid) || force) {
+ btrfs_get_root_last_trans(root) < trans->transid) || force) {
WARN_ON(!force && root->commit_root != root->node);
/*
@@ -423,15 +428,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
smp_wmb();
spin_lock(&fs_info->fs_roots_radix_lock);
- if (root->last_trans == trans->transid && !force) {
+ if (btrfs_get_root_last_trans(root) == trans->transid && !force) {
spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
radix_tree_tag_set(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
- root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(root, trans->transid);
/* this is pretty tricky. We don't want to
* take the relocation lock in btrfs_record_root_in_trans
@@ -474,7 +479,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
/* Make sure we don't try to update the root at commit time */
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
}
@@ -493,7 +498,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
* and barriers
*/
smp_rmb();
- if (root->last_trans == trans->transid &&
+ if (btrfs_get_root_last_trans(root) == trans->transid &&
!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
@@ -552,13 +557,42 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
if (!fs_info->reloc_ctl ||
!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
root->reloc_root)
return false;
return true;
}
+static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush,
+ u64 num_bytes,
+ u64 *delayed_refs_bytes)
+{
+ struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
+ u64 bytes = num_bytes + *delayed_refs_bytes;
+ int ret;
+
+ /*
+ * We want to reserve all the bytes we may need all at once, so we only
+ * do 1 enospc flushing cycle per transaction start.
+ */
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+
+ /*
+ * If we are an emergency flush, which can steal from the global block
+ * reserve, then attempt to not reserve space for the delayed refs, as
+ * we will consume space for them from the global block reserve.
+ */
+ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ bytes -= *delayed_refs_bytes;
+ *delayed_refs_bytes = 0;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ }
+
+ return ret;
+}
+
static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
unsigned int type, enum btrfs_reserve_flush_enum flush,
@@ -566,10 +600,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
u64 qgroup_reserved = 0;
+ u64 delayed_refs_bytes = 0;
bool reloc_reserved = false;
bool do_chunk_alloc = false;
int ret;
@@ -592,9 +628,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* the appropriate flushing if need be.
*/
if (num_items && root != fs_info->chunk_root) {
- struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
- u64 delayed_refs_bytes = 0;
-
qgroup_reserved = num_items * fs_info->nodesize;
/*
* Use prealloc for now, as there might be a currently running
@@ -606,20 +639,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (ret)
return ERR_PTR(ret);
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
/*
- * We want to reserve all the bytes we may need all at once, so
- * we only do 1 enospc flushing cycle per transaction start. We
- * accomplish this by simply assuming we'll do num_items worth
- * of delayed refs updates in this trans handle, and refill that
- * amount for whatever is missing in the reserve.
+ * If we plan to insert/update/delete "num_items" from a btree,
+ * we will also generate delayed refs for extent buffers in the
+ * respective btree paths, so reserve space for the delayed refs
+ * that will be generated by the caller as it modifies btrees.
+ * Try to reserve them to avoid excessive use of the global
+ * block reserve.
*/
- num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- num_items);
- num_bytes += delayed_refs_bytes;
- }
+ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);
/*
* Do the reservation for the relocation root creation
@@ -629,16 +658,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush);
+ ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
+ &delayed_refs_bytes);
if (ret)
goto reserve_fail;
- if (delayed_refs_bytes) {
- btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes);
- num_bytes -= delayed_refs_bytes;
- }
- btrfs_block_rsv_add_bytes(rsv, num_bytes, true);
- if (rsv->space_info->force_alloc)
+ btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);
+
+ if (trans_rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
!btrfs_block_rsv_full(delayed_refs_rsv)) {
@@ -698,6 +725,7 @@ again:
h->type = type;
INIT_LIST_HEAD(&h->new_bgs);
+ btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);
smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
@@ -710,8 +738,17 @@ again:
if (num_bytes) {
trace_btrfs_space_reservation(fs_info, "transaction",
h->transid, num_bytes, 1);
- h->block_rsv = &fs_info->trans_block_rsv;
+ h->block_rsv = trans_rsv;
h->bytes_reserved = num_bytes;
+ if (delayed_refs_bytes > 0) {
+ trace_btrfs_space_reservation(fs_info,
+ "local_delayed_refs_rsv",
+ h->transid,
+ delayed_refs_bytes, 1);
+ h->delayed_refs_bytes_reserved = delayed_refs_bytes;
+ btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
+ delayed_refs_bytes = 0;
+ }
h->reloc_reserved = reloc_reserved;
}
@@ -766,8 +803,10 @@ join_fail:
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
if (num_bytes)
- btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
- num_bytes, NULL);
+ btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
+ if (delayed_refs_bytes)
+ btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
+ delayed_refs_bytes);
reserve_fail:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -814,7 +853,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo
}
/*
- * btrfs_attach_transaction() - catch the running transaction
+ * Catch the running transaction.
*
* It is used when we want to commit the current the transaction, but
* don't want to start a new one.
@@ -833,7 +872,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
}
/*
- * btrfs_attach_transaction_barrier() - catch the running transaction
+ * Catch the running transaction.
*
* It is similar to the above function, the difference is this one
* will wait for all the inactive transactions until they fully
@@ -909,7 +948,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
int ret = 0;
if (transid) {
- if (transid <= fs_info->last_trans_committed)
+ if (transid <= btrfs_get_last_trans_committed(fs_info))
goto out;
/* find specified transaction */
@@ -933,7 +972,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
* raced with btrfs_commit_transaction
*/
if (!cur_trans) {
- if (transid > fs_info->last_trans_committed)
+ if (transid > btrfs_get_last_trans_committed(fs_info))
ret = -EINVAL;
goto out;
}
@@ -988,11 +1027,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
if (!trans->block_rsv) {
ASSERT(!trans->bytes_reserved);
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
}
- if (!trans->bytes_reserved)
+ if (!trans->bytes_reserved) {
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
+ }
ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
trace_btrfs_space_reservation(fs_info, "transaction",
@@ -1000,6 +1042,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
+
+ if (!trans->delayed_refs_bytes_reserved)
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
+ trans->transid,
+ trans->delayed_refs_bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
+ trans->delayed_refs_bytes_reserved, NULL);
+ trans->delayed_refs_bytes_reserved = 0;
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -1007,7 +1059,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- int err = 0;
+ int ret = 0;
if (refcount_read(&trans->use_count) > 1) {
refcount_dec(&trans->use_count);
@@ -1046,13 +1098,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
- err = trans->aborted;
+ ret = trans->aborted;
else
- err = -EROFS;
+ ret = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- return err;
+ return ret;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans)
@@ -1073,8 +1125,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark)
{
- int err = 0;
- int werr = 0;
+ int ret = 0;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
@@ -1084,7 +1135,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
mark, &cached_state)) {
bool wait_writeback = false;
- err = convert_extent_bit(dirty_pages, start, end,
+ ret = convert_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT,
mark, &cached_state);
/*
@@ -1100,22 +1151,22 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
* We cleanup any entries left in the io tree when committing
* the transaction (through extent_io_tree_release()).
*/
- if (err == -ENOMEM) {
- err = 0;
+ if (ret == -ENOMEM) {
+ ret = 0;
wait_writeback = true;
}
- if (!err)
- err = filemap_fdatawrite_range(mapping, start, end);
- if (err)
- werr = err;
- else if (wait_writeback)
- werr = filemap_fdatawait_range(mapping, start, end);
+ if (!ret)
+ ret = filemap_fdatawrite_range(mapping, start, end);
+ if (!ret && wait_writeback)
+ ret = filemap_fdatawait_range(mapping, start, end);
free_extent_state(cached_state);
+ if (ret)
+ break;
cached_state = NULL;
cond_resched();
start = end + 1;
}
- return werr;
+ return ret;
}
/*
@@ -1127,12 +1178,11 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
- int err = 0;
- int werr = 0;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
+ int ret = 0;
while (find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
@@ -1144,22 +1194,20 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* concurrently - we do it only at transaction commit time when
* it's safe to do it (through extent_io_tree_release()).
*/
- err = clear_extent_bit(dirty_pages, start, end,
+ ret = clear_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT, &cached_state);
- if (err == -ENOMEM)
- err = 0;
- if (!err)
- err = filemap_fdatawait_range(mapping, start, end);
- if (err)
- werr = err;
+ if (ret == -ENOMEM)
+ ret = 0;
+ if (!ret)
+ ret = filemap_fdatawait_range(mapping, start, end);
free_extent_state(cached_state);
+ if (ret)
+ break;
cached_state = NULL;
cond_resched();
start = end + 1;
}
- if (err)
- werr = err;
- return werr;
+ return ret;
}
static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
@@ -1184,7 +1232,7 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
bool errors = false;
int err;
- ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if ((mark & EXTENT_DIRTY) &&
@@ -1331,7 +1379,7 @@ again:
}
/* Now flush any delayed refs generated by updating all of the roots */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
@@ -1346,7 +1394,7 @@ again:
* so we want to keep this flushing in this loop to make sure
* everything gets run.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
}
@@ -1447,7 +1495,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
ASSERT(atomic_read(&root->log_commit[1]) == 0);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
btrfs_qgroup_free_meta_all_pertrans(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1481,45 +1529,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
}
/*
- * defrag a given btree.
- * Every leaf in the btree is read and defragged.
- */
-int btrfs_defrag_root(struct btrfs_root *root)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_trans_handle *trans;
- int ret;
-
- if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
- return 0;
-
- while (1) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
-
- ret = btrfs_defrag_leaves(trans, root);
-
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(info);
- cond_resched();
-
- if (btrfs_fs_closing(info) || ret != -EAGAIN)
- break;
-
- if (btrfs_defrag_cancelled(info)) {
- btrfs_debug(info, "defrag_root cancelled");
- ret = -EAGAIN;
- break;
- }
- }
- clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
- return ret;
-}
-
-/*
* Do all special snapshot related qgroup dirty hack.
*
* Will do all needed qgroup inherit and dirty hack like switch commit
@@ -1536,11 +1545,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
int ret;
/*
- * Save some performance in the case that qgroups are not
- * enabled. If this check races with the ioctl, rescan will
- * kick in anyway.
+ * Save some performance in the case that qgroups are not enabled. If
+ * this check races with the ioctl, rescan will kick in anyway.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/*
@@ -1564,7 +1572,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* for now flush the delayed refs to narrow the race window where the
* qgroup counters could end up wrong.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -1578,8 +1586,8 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
goto out;
/* Now qgroup are all updated, we can inherit it to new qgroups */
- ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
- inherit);
+ ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid,
+ btrfs_root_id(parent), inherit);
if (ret < 0)
goto out;
@@ -1636,7 +1644,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode = pending->dir;
+ struct inode *parent_inode = &pending->dir->vfs_inode;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
@@ -1729,6 +1737,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
+ ret = btrfs_create_qgroup(trans, objectid);
+ if (ret && ret != -EEXIST) {
+ if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+ }
+
/*
* pull in the delayed directory update
* and the delayed inode item
@@ -1811,7 +1827,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert root back/forward references
*/
ret = btrfs_add_root_ref(trans, objectid,
- parent_root->root_key.objectid,
+ btrfs_root_id(parent_root),
btrfs_ino(BTRFS_I(parent_inode)), index,
&fname.disk_name);
if (ret) {
@@ -1840,16 +1856,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* To co-operate with that hack, we do hack again.
* Or snapshot will be greatly slowed down by a subtree qgroup rescan
*/
- ret = qgroup_account_snapshot(trans, root, parent_root,
- pending->inherit, objectid);
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
+ ret = qgroup_account_snapshot(trans, root, parent_root,
+ pending->inherit, objectid);
+ else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid,
+ btrfs_root_id(parent_root), pending->inherit);
if (ret < 0)
goto fail;
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
index);
- /* We have check then name at the beginning, so it is impossible. */
- BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1857,8 +1875,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
fname.disk_name.len * 2);
- parent_inode->i_mtime = inode_set_ctime_current(parent_inode);
- ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
+ inode_set_mtime_to_ts(parent_inode,
+ inode_set_ctime_current(parent_inode));
+ ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1940,19 +1959,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->uuid_tree_generation = root_item->generation;
}
-int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
-{
- struct btrfs_transaction *trans;
- int ret = 0;
-
- spin_lock(&info->trans_lock);
- trans = info->running_transaction;
- if (trans)
- ret = (trans->state >= TRANS_STATE_COMMIT_START);
- spin_unlock(&info->trans_lock);
- return ret;
-}
-
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
struct btrfs_transaction *trans;
@@ -1992,6 +1998,25 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
btrfs_put_transaction(cur_trans);
}
+/*
+ * If there is a running transaction commit it or if it's already committing,
+ * wait for its commit to complete. Does not start and commit a new transaction
+ * if there isn't any running.
+ */
+int btrfs_commit_current_transaction(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ int ret = PTR_ERR(trans);
+
+ return (ret == -ENOENT) ? 0 : ret;
+ }
+
+ return btrfs_commit_transaction(trans);
+}
+
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2081,8 +2106,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the new_bgs list during a transaction abort.
+ */
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ spin_unlock(&fs_info->unused_bgs_lock);
}
}
@@ -2113,7 +2144,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
/*
@@ -2400,7 +2431,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
goto unlock_reloc;
@@ -2533,7 +2564,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
- fs_info->last_trans_committed = cur_trans->transid;
+ btrfs_set_last_trans_committed(fs_info, cur_trans->transid);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2622,7 +2653,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
+ btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root));
btrfs_kill_all_delayed_nodes(root);
@@ -2651,25 +2682,23 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
*/
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit)
+ unsigned int line, int error, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- WRITE_ONCE(trans->aborted, errno);
- WRITE_ONCE(trans->transaction->aborted, errno);
- if (first_hit && errno == -ENOSPC)
+ WRITE_ONCE(trans->aborted, error);
+ WRITE_ONCE(trans->transaction->aborted, error);
+ if (first_hit && error == -ENOSPC)
btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+ __btrfs_handle_fs_error(fs_info, function, line, error, NULL);
}
int __init btrfs_transaction_init(void)
{
- btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
- sizeof(struct btrfs_trans_handle), 0,
- SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
+ btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY);
if (!btrfs_trans_handle_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7623db359881..dd9ce9b9f69e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -6,12 +6,27 @@
#ifndef BTRFS_TRANSACTION_H
#define BTRFS_TRANSACTION_H
+#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/time64.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
#include "btrfs_inode.h"
#include "delayed-ref.h"
-#include "ctree.h"
+#include "extent-io-tree.h"
+#include "block-rsv.h"
+#include "messages.h"
#include "misc.h"
+struct dentry;
+struct inode;
+struct btrfs_pending_snapshot;
+struct btrfs_fs_info;
+struct btrfs_root_item;
+struct btrfs_root;
+struct btrfs_path;
+
/*
* Signal that a direct IO write is in progress, to avoid deadlock for sync
* direct IO writes when fsync is called during the direct IO write path.
@@ -127,8 +142,10 @@ enum {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 delayed_refs_bytes_reserved;
u64 chunk_bytes_reserved;
unsigned long delayed_ref_updates;
+ unsigned long delayed_ref_csum_deletions;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
@@ -148,6 +165,7 @@ struct btrfs_trans_handle {
bool in_fsync;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
+ struct btrfs_block_rsv delayed_rsv;
};
/*
@@ -160,7 +178,7 @@ struct btrfs_trans_handle {
struct btrfs_pending_snapshot {
struct dentry *dentry;
- struct inode *dir;
+ struct btrfs_inode *dir;
struct btrfs_root *root;
struct btrfs_root_item *root_item;
struct btrfs_root *snap;
@@ -181,7 +199,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
{
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
- inode->last_sub_trans = inode->root->log_transid;
+ inode->last_sub_trans = btrfs_get_root_log_transid(inode->root);
inode->last_log_commit = inode->last_sub_trans - 1;
spin_unlock(&inode->lock);
}
@@ -209,32 +227,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
-bool __cold abort_should_print_stack(int errno);
+bool __cold abort_should_print_stack(int error);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact stack trace is reported for some errors.
*/
-#define btrfs_abort_transaction(trans, errno) \
+#define btrfs_abort_transaction(trans, error) \
do { \
- bool first = false; \
+ bool __first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- first = true; \
- if (WARN(abort_should_print_stack(errno), \
+ __first = true; \
+ if (WARN(abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
- (errno))) { \
+ (error))) { \
/* Stack trace printed. */ \
} else { \
btrfs_err((trans)->fs_info, \
"Transaction aborted (error %d)", \
- (errno)); \
+ (error)); \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno), first); \
+ __LINE__, (error), __first); \
} while (0)
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -252,11 +270,11 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root);
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+int btrfs_commit_current_transaction(struct btrfs_root *root);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
@@ -266,14 +284,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
-int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit);
+ unsigned int line, int error, bool first_hit);
int __init btrfs_transaction_init(void);
void __cold btrfs_transaction_exit(void);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 53c74010140e..ffa5b83d3a4a 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -21,7 +21,6 @@
#include "messages.h"
#include "ctree.h"
#include "tree-checker.h"
-#include "disk-io.h"
#include "compression.h"
#include "volumes.h"
#include "misc.h"
@@ -29,6 +28,7 @@
#include "accessors.h"
#include "file-item.h"
#include "inode-item.h"
+#include "dir-item.h"
#include "extent-tree.h"
/*
@@ -65,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -92,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -152,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -337,6 +340,24 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
}
+ /*
+ * For non-compressed data extents, ram_bytes should match its
+ * disk_num_bytes.
+ * However we do not really utilize ram_bytes in this case, so this check
+ * is only optional for DEBUG builds for developers to catch the
+ * unexpected behaviors.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG) &&
+ btrfs_file_extent_compression(leaf, fi) == BTRFS_COMPRESS_NONE &&
+ btrfs_file_extent_disk_bytenr(leaf, fi)) {
+ if (WARN_ON(btrfs_file_extent_ram_bytes(leaf, fi) !=
+ btrfs_file_extent_disk_num_bytes(leaf, fi)))
+ file_extent_err(leaf, slot,
+"mismatch ram_bytes (%llu) and disk_num_bytes (%llu) for non-compressed extent",
+ btrfs_file_extent_ram_bytes(leaf, fi),
+ btrfs_file_extent_disk_num_bytes(leaf, fi));
+ }
+
return 0;
}
@@ -614,7 +635,7 @@ static int check_dir_item(struct extent_buffer *leaf,
*/
if (key->type == BTRFS_DIR_ITEM_KEY ||
key->type == BTRFS_XATTR_ITEM_KEY) {
- char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+ char namebuf[MAX(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
read_extent_buffer(leaf, namebuf,
(unsigned long)(di + 1), name_len);
@@ -648,6 +669,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -1004,6 +1026,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(eb->fs_info,
"corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -1259,6 +1282,7 @@ static void extent_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(eb->fs_info,
"corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -1527,6 +1551,9 @@ static int check_extent_item(struct extent_buffer *leaf,
}
inline_refs += btrfs_shared_data_ref_count(leaf, sref);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ break;
default:
extent_err(leaf, slot, "unknown inline ref type: %u",
inline_type);
@@ -1743,6 +1770,25 @@ static int check_inode_ref(struct extent_buffer *leaf,
return 0;
}
+static int check_raid_stripe_extent(const struct extent_buffer *leaf,
+ const struct btrfs_key *key, int slot)
+{
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
+ generic_err(leaf, slot,
+"invalid key objectid for raid stripe extent, have %llu expect aligned to %u",
+ key->objectid, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+
+ if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) {
+ generic_err(leaf, slot,
+ "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset");
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
static int check_dev_extent_item(const struct extent_buffer *leaf,
const struct btrfs_key *key,
int slot,
@@ -1866,6 +1912,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
ret = check_extent_data_ref(leaf, key, slot);
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ ret = check_raid_stripe_extent(leaf, key, slot);
+ break;
}
if (ret)
@@ -1889,6 +1938,11 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
return BTRFS_TREE_BLOCK_INVALID_LEVEL;
}
+ if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) {
+ generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set");
+ return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET;
+ }
+
/*
* Extent buffers from a relocation tree have a owner field that
* corresponds to the subvolume tree they are based on. So just from an
@@ -1950,6 +2004,7 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
u64 item_data_end;
+ enum btrfs_tree_block_status ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -2005,21 +2060,10 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
}
- /*
- * We only want to do this if WRITTEN is set, otherwise the leaf
- * may be in some intermediate state and won't appear valid.
- */
- if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) {
- enum btrfs_tree_block_status ret;
-
- /*
- * Check if the item size and content meet other
- * criteria
- */
- ret = check_leaf_item(leaf, &key, slot, &prev_key);
- if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
- return ret;
- }
+ /* Check if the item size and content meet other criteria. */
+ ret = check_leaf_item(leaf, &key, slot, &prev_key);
+ if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
+ return ret;
prev_key.objectid = key.objectid;
prev_key.type = key.type;
@@ -2049,6 +2093,11 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node)
int level = btrfs_header_level(node);
u64 bytenr;
+ if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) {
+ generic_err(node, 0, "invalid flag for node, WRITTEN not set");
+ return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET;
+ }
+
if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
generic_err(node, 0,
"invalid level for node, have %d expect [1, %d]",
@@ -2113,7 +2162,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
* Skip dummy fs, as selftests don't create unique ebs for each dummy
* root.
*/
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
+ if (btrfs_is_testing(eb->fs_info))
return 0;
/*
* There are several call sites (backref walking, qgroup, and data
@@ -2186,7 +2235,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* So we only checks tree blocks which is read from disk, whose
* generation <= fs_info->last_trans_committed.
*/
- if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+ if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info))
return 0;
/* We have @first_key, so this @eb must have at least one item */
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 3c2a02a72f64..01669cfa6578 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -6,10 +6,12 @@
#ifndef BTRFS_TREE_CHECKER_H
#define BTRFS_TREE_CHECKER_H
+#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
struct extent_buffer;
struct btrfs_chunk;
+struct btrfs_key;
/* All the extra info needed to verify the parentness of a tree block. */
struct btrfs_tree_parent_check {
@@ -22,7 +24,7 @@ struct btrfs_tree_parent_check {
/*
* Expected transid, can be 0 to skip the check, but such skip
- * should only be utlized for backref walk related code.
+ * should only be utilized for backref walk related code.
*/
u64 transid;
@@ -51,6 +53,7 @@ enum btrfs_tree_block_status {
BTRFS_TREE_BLOCK_INVALID_BLOCKPTR,
BTRFS_TREE_BLOCK_INVALID_ITEM,
BTRFS_TREE_BLOCK_INVALID_OWNER,
+ BTRFS_TREE_BLOCK_WRITTEN_NOT_SET,
};
/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cc9a2f8a4ae3..31adea5b0b96 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -13,13 +13,11 @@
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
-#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
-#include "zoned.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
@@ -140,11 +138,14 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
-static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{
unsigned int nofs_flag;
struct inode *inode;
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(is_fstree(btrfs_root_id(root)));
+
/*
* We're holding a transaction handle whether we are logging or
* replaying a log tree, so we must make sure NOFS semantics apply
@@ -153,10 +154,13 @@ static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
* attempt a transaction commit, resulting in a deadlock.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ inode = btrfs_iget(objectid, root);
memalloc_nofs_restore(nofs_flag);
- return inode;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return BTRFS_I(inode);
}
/*
@@ -320,8 +324,7 @@ struct walk_control {
/*
* Ignore any items from the inode currently being processed. Needs
- * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
- * the LOG_WALK_REPLAY_INODES stage.
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
*/
bool ignore_cur_inode;
@@ -366,8 +369,7 @@ static int process_one_buffer(struct btrfs_root *log,
}
if (wc->pin) {
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
- eb->len);
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
if (ret)
return ret;
@@ -413,7 +415,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* the leaf before writing into the log tree. See the comments at
* copy_items() for more details.
*/
- ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
@@ -613,21 +615,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
-{
- struct inode *inode;
-
- inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- inode = NULL;
- return inode;
-}
-
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
@@ -653,7 +640,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
u64 start = key->offset;
u64 nbytes = 0;
struct btrfs_file_extent_item *item;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long size;
int ret = 0;
@@ -677,23 +664,19 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
+ return 0;
}
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
- }
+ inode = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
/*
* first check to see if we already have this extent in the
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path,
- btrfs_ino(BTRFS_I(inode)), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
@@ -727,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
@@ -770,7 +753,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
if (ins.objectid > 0) {
- struct btrfs_ref ref = { 0 };
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
@@ -784,12 +766,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (ret < 0) {
goto out;
} else if (ret == 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_ADD_DELAYED_REF,
- ins.objectid, ins.offset, 0);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- key->objectid, offset, 0, false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, key->objectid, offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
@@ -799,7 +784,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
- root->root_key.objectid,
+ btrfs_root_id(root),
key->objectid, offset, &ins);
if (ret)
goto out;
@@ -818,9 +803,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_list(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums, 0, false);
- if (ret)
+ &ordered_sums, false);
+ if (ret < 0)
goto out;
+ ret = 0;
/*
* Now delete all existing cums in the csum root that
* cover our range. We do this because we can have an
@@ -902,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
- extent_end - start);
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
if (ret)
goto out;
update_inode:
- btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, inode);
out:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -948,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fscrypt_str name;
struct extent_buffer *leaf;
struct btrfs_key location;
@@ -963,9 +948,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -973,10 +959,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
out:
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1087,7 +1074,9 @@ again:
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
struct btrfs_inode_ref *victim_ref;
unsigned long ptr;
unsigned long ptr_end;
@@ -1149,7 +1138,7 @@ again:
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
- struct inode *victim_parent;
+ struct btrfs_inode *victim_parent;
leaf = path->nodes[0];
@@ -1160,13 +1149,13 @@ again:
struct fscrypt_str victim_name;
extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
goto next;
ret = read_alloc_one_name(leaf, &extref->name,
- btrfs_inode_extref_name_len(leaf, extref),
- &victim_name);
+ victim_name.len, &victim_name);
if (ret)
return ret;
@@ -1181,18 +1170,18 @@ again:
kfree(victim_name.name);
return ret;
} else if (!ret) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
+ victim_parent = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(victim_parent)) {
+ ret = PTR_ERR(victim_parent);
+ } else {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(victim_parent),
+ victim_parent,
inode, &victim_name);
+ iput(&victim_parent->vfs_inode);
}
- iput(victim_parent);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1326,19 +1315,18 @@ again:
ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
if (!ret) {
- struct inode *dir;
+ struct btrfs_inode *dir;
btrfs_release_path(path);
- dir = read_one_inode(root, parent_id);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
kfree(name.name);
goto out;
}
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
- inode, &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (ret)
goto out;
goto again;
@@ -1370,8 +1358,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct inode *dir = NULL;
- struct inode *inode = NULL;
+ struct btrfs_inode *dir = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
@@ -1404,15 +1392,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ dir = NULL;
goto out;
}
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -1420,24 +1412,44 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (log_ref_ver) {
ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid);
+ if (ret)
+ goto out;
/*
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
if (!dir) {
- ret = -ENOENT;
- goto out;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ /*
+ * A new parent dir may have not been
+ * logged and not exist in the subvolume
+ * tree, see the comment above before
+ * the loop when getting the first
+ * parent dir.
+ */
+ if (ret == -ENOENT) {
+ /*
+ * The next extref may refer to
+ * another parent dir that
+ * exists, so continue.
+ */
+ ret = 0;
+ goto next;
+ }
+ goto out;
+ }
}
} else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
+ if (ret)
+ goto out;
}
- if (ret)
- goto out;
- ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index, &name);
+ ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+ ref_index, &name);
if (ret < 0) {
goto out;
} else if (ret == 0) {
@@ -1448,8 +1460,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir), BTRFS_I(inode),
+ ret = __add_inode_ref(trans, root, path, log, dir, inode,
inode_objectid, parent_objectid,
ref_index, &name);
if (ret) {
@@ -1459,22 +1470,22 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name, 0, ref_index);
+ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
if (ret)
goto out;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
/* Else, ret == 1, we already have a perfect match, we're done. */
+next:
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
kfree(name.name);
name.name = NULL;
- if (log_ref_ver) {
- iput(dir);
+ if (log_ref_ver && dir) {
+ iput(&dir->vfs_inode);
dir = NULL;
}
}
@@ -1487,8 +1498,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
- key);
+ ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
if (ret)
goto out;
@@ -1497,13 +1507,14 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
kfree(name.name);
- iput(dir);
- iput(inode);
+ if (dir)
+ iput(&dir->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
-static int count_inode_extrefs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret = 0;
int name_len;
@@ -1517,8 +1528,8 @@ static int count_inode_extrefs(struct btrfs_root *root,
struct extent_buffer *leaf;
while (1) {
- ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
- &extref, &offset);
+ ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
+ path, &extref, &offset);
if (ret)
break;
@@ -1546,8 +1557,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
return nlink;
}
-static int count_inode_refs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret;
struct btrfs_key key;
@@ -1562,7 +1572,7 @@ static int count_inode_refs(struct btrfs_root *root,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
if (ret < 0)
break;
if (ret > 0) {
@@ -1614,9 +1624,9 @@ process_slot:
* will free the inode.
*/
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct inode *inode)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int ret;
u64 nlink = 0;
@@ -1626,13 +1636,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = count_inode_refs(root, BTRFS_I(inode), path);
+ ret = count_inode_refs(BTRFS_I(inode), path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(root, BTRFS_I(inode), path);
+ ret = count_inode_extrefs(BTRFS_I(inode), path);
if (ret < 0)
goto out;
@@ -1642,11 +1652,12 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
goto out;
}
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
if (inode->i_nlink == 0) {
if (S_ISDIR(inode->i_mode)) {
@@ -1671,12 +1682,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_key key;
- struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
break;
@@ -1698,14 +1710,14 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
- ret = fixup_inode_link_count(trans, root, inode);
- iput(inode);
+ ret = fixup_inode_link_count(trans, &inode->vfs_inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
@@ -1733,12 +1745,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
{
struct btrfs_key key;
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
+ struct inode *vfs_inode;
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
@@ -1747,15 +1761,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (ret == 0) {
- if (!inode->i_nlink)
- set_nlink(inode, 1);
+ if (!vfs_inode->i_nlink)
+ set_nlink(vfs_inode, 1);
else
- inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ inc_nlink(vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
} else if (ret == -EEXIST) {
ret = 0;
}
- iput(inode);
+ iput(vfs_inode);
return ret;
}
@@ -1771,27 +1785,26 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name,
struct btrfs_key *location)
{
- struct inode *inode;
- struct inode *dir;
+ struct btrfs_inode *inode;
+ struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
- iput(inode);
- return -EIO;
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
+ iput(&inode->vfs_inode);
+ return PTR_ERR(dir);
}
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- 1, index);
+ ret = btrfs_add_link(trans, dir, inode, name, 1, index);
/* FIXME, put inode into FIXUP list */
- iput(inode);
- iput(dir);
+ iput(&inode->vfs_inode);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -1853,16 +1866,16 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool index_dst_matches = false;
struct btrfs_key log_key;
struct btrfs_key search_key;
- struct inode *dir;
+ struct btrfs_inode *dir;
u8 log_flags;
bool exists;
int ret;
bool update_size = true;
bool name_added = false;
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
+ dir = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
if (ret)
@@ -1883,9 +1896,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir_dst_di);
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- dir_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
dir_dst_matches = (ret == 1);
@@ -1900,9 +1912,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(index_dst_di);
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- index_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
index_dst_matches = (ret == 1);
@@ -1957,11 +1968,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, dir);
}
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (!ret && name_added)
ret = 1;
return ret;
@@ -2118,16 +2129,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
- struct inode *dir,
+ struct btrfs_inode *dir,
struct btrfs_key *dir_key)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
struct fscrypt_str name = { 0 };
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
struct btrfs_key location;
/*
@@ -2164,9 +2175,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -2174,9 +2186,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- inc_nlink(inode);
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name);
+ inc_nlink(&inode->vfs_inode);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -2186,7 +2197,8 @@ out:
btrfs_release_path(path);
btrfs_release_path(log_path);
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -2310,7 +2322,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_key dir_key;
struct btrfs_key found_key;
struct btrfs_path *log_path;
- struct inode *dir;
+ struct btrfs_inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
@@ -2318,14 +2330,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (!log_path)
return -ENOMEM;
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+ * It isn't an error if the inode isn't there, that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
+ if (IS_ERR(dir)) {
btrfs_free_path(log_path);
- return 0;
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
}
range_start = 0;
@@ -2387,7 +2402,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
- iput(dir);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -2431,23 +2446,30 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
nritems = btrfs_header_nritems(eb);
for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(eb, &key, i);
+ struct btrfs_inode_item *inode_item;
- /* inode keys are done during the first stage */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
- wc->stage == LOG_WALK_REPLAY_INODES) {
- struct btrfs_inode_item *inode_item;
- u32 mode;
+ btrfs_item_key_to_cpu(eb, &key, i);
- inode_item = btrfs_item_ptr(eb, i,
- struct btrfs_inode_item);
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
/*
- * If we have a tmpfile (O_TMPFILE) that got fsync'ed
- * and never got linked before the fsync, skip it, as
- * replaying it is pointless since it would be deleted
- * later. We skip logging tmpfiles, but it's always
- * possible we are replaying a log created with a kernel
- * that used to log tmpfiles.
+ * An inode with no links is either:
+ *
+ * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
+ * got linked before the fsync, skip it, as replaying
+ * it is pointless since it would be deleted later.
+ * We skip logging tmpfiles, but it's always possible
+ * we are replaying a log created with a kernel that
+ * used to log tmpfiles;
+ *
+ * 2) A non-tmpfile which got its last link deleted
+ * while holding an open fd on it and later got
+ * fsynced through that fd. We always log the
+ * parent inodes when inode->last_unlink_trans is
+ * set to the current transaction, so ignore all the
+ * inode items for this inode. We will delete the
+ * inode when processing the parent directory with
+ * replay_dir_deletes().
*/
if (btrfs_inode_nlink(eb, inode_item) == 0) {
wc->ignore_cur_inode = true;
@@ -2455,8 +2477,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
} else {
wc->ignore_cur_inode = false;
}
- ret = replay_xattr_deletes(wc->trans, root, log,
- path, key.objectid);
+ }
+
+ /* Inode keys are done during the first stage. */
+ if (key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ u32 mode;
+
+ ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
@@ -2481,30 +2509,28 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
*/
if (S_ISREG(mode)) {
struct btrfs_drop_extents_args drop_args = { 0 };
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 from;
- inode = read_one_inode(root, key.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
- from = ALIGN(i_size_read(inode),
+ from = ALIGN(i_size_read(&inode->vfs_inode),
root->fs_info->sectorsize);
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root,
- BTRFS_I(inode),
+ ret = btrfs_drop_extents(wc->trans, root, inode,
&drop_args);
if (!ret) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans,
- root, BTRFS_I(inode));
+ ret = btrfs_update_inode(wc->trans, inode);
}
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
}
@@ -2539,9 +2565,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
- if (ret && ret != -ENOENT)
+ if (ret)
break;
- ret = 0;
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
@@ -2562,14 +2587,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
/*
* Correctly adjust the reserved bytes occupied by a log tree extent buffer
*/
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
btrfs_err(fs_info, "unable to find block group for %llu", start);
- return;
+ return -ENOENT;
}
spin_lock(&cache->space_info->lock);
@@ -2580,28 +2605,22 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
spin_unlock(&cache->space_info->lock);
btrfs_put_block_group(cache);
+
+ return 0;
}
static int clean_log_buffer(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
- int ret;
-
btrfs_tree_lock(eb);
btrfs_clear_buffer_dirty(trans, eb);
wait_on_extent_buffer_writeback(eb);
btrfs_tree_unlock(eb);
- if (trans) {
- ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
- if (ret)
- return ret;
- btrfs_redirty_list_add(trans->transaction, eb);
- } else {
- unaccount_log_buffer(eb->fs_info, eb->start);
- }
+ if (trans)
+ return btrfs_pin_reserved_extent(trans, eb);
- return 0;
+ return unaccount_log_buffer(eb->fs_info, eb->start);
}
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
@@ -2842,6 +2861,52 @@ static void wait_for_writer(struct btrfs_root *root)
finish_wait(&root->log_writer_wait, &wait);
}
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
+ ctx->logged_before = false;
+ ctx->inode = inode;
+ INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
+ ctx->scratch_eb = NULL;
+}
+
+void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = ctx->inode;
+
+ if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return;
+
+ /*
+ * Don't care about allocation failure. This is just for optimization,
+ * if we fail to allocate here, we will try again later if needed.
+ */
+ ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
+}
+
+void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ btrfs_assert_inode_locked(ctx->inode);
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
@@ -2867,10 +2932,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
}
/*
- * btrfs_sync_log does sends a given tree log down to the disk and
- * updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk only
- * if it returns 0.
+ * Sends a given tree log down to the disk and updates the super blocks to
+ * record it. When this call is done, you know that any inodes previously
+ * logged are safely on disk only if it returns 0.
*
* Any other return value means you need to call btrfs_commit_transaction.
* Some of the edge cases for fsyncing directories that have had unlinks
@@ -2980,7 +3044,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&log->root_item, log->node);
memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
- root->log_transid++;
+ btrfs_set_root_log_transid(root, root->log_transid + 1);
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
@@ -3018,15 +3082,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = update_log_root(trans, log, &new_root_item);
if (ret) {
- if (!list_empty(&root_log_ctx.list))
- list_del_init(&root_log_ctx.list);
-
+ list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
if (ret != -ENOSPC)
btrfs_err(fs_info,
"failed to update log for root %llu ret %d",
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
@@ -3040,7 +3102,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
@@ -3155,8 +3216,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* someone else already started it. We use <= and not < because the
* first log transaction has an ID of 0.
*/
- ASSERT(root->last_log_commit <= log_transid);
- root->last_log_commit = log_transid;
+ ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
+ btrfs_set_root_last_log_commit(root, log_transid);
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
@@ -3230,8 +3291,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
}
}
- clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
+ extent_io_tree_release(&log->dirty_log_pages);
extent_io_tree_release(&log->log_csum_range);
btrfs_put_root(log);
@@ -3646,6 +3706,30 @@ out:
return ret;
}
+static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
+{
+ const int slot = path->slots[0];
+
+ if (ctx->scratch_eb) {
+ copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
+ } else {
+ ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!ctx->scratch_eb)
+ return -ENOMEM;
+ }
+
+ btrfs_release_path(path);
+ path->nodes[0] = ctx->scratch_eb;
+ path->slots[0] = slot;
+ /*
+ * Add extra ref to scratch eb so that it is not freed when callers
+ * release the path, so we can reuse it later if needed.
+ */
+ atomic_inc(&ctx->scratch_eb->refs);
+
+ return 0;
+}
+
static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
@@ -3660,23 +3744,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
- int i;
+ int ret;
/*
* We need to clone the leaf, release the read lock on it, and use the
* clone before modifying the log tree. See the comment at copy_items()
* about why we need to do this.
*/
- src = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!src)
- return -ENOMEM;
+ ret = clone_leaf(path, ctx);
+ if (ret < 0)
+ return ret;
- i = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = src;
- path->slots[0] = i;
+ src = path->nodes[0];
- for (; i < nritems; i++) {
+ for (int i = path->slots[0]; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key key;
int ret;
@@ -4157,19 +4238,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
/*
* We do not need to set the nbytes field, in fact during a fast fsync
@@ -4194,8 +4278,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
+ struct btrfs_key key;
int ret;
+ btrfs_get_inode_key(inode, &key);
/*
* If we are doing a fast fsync and the inode was logged before in the
* current transaction, then we know the inode was previously logged and
@@ -4207,7 +4293,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
* already exists can also result in unnecessarily splitting a leaf.
*/
if (!inode_item_dropped && inode->logged_trans == trans->transid) {
- ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+ ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
ASSERT(ret <= 0);
if (ret > 0)
ret = -ENOENT;
@@ -4221,7 +4307,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
* the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
* flags and set ->logged_trans to 0.
*/
- ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
sizeof(*inode_item));
ASSERT(ret != -EEXIST);
}
@@ -4286,17 +4372,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_path *dst_path,
struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
- u64 logged_isize)
+ u64 logged_isize, struct btrfs_log_ctx *ctx)
{
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
struct extent_buffer *src;
- int ret = 0;
+ int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
struct btrfs_item_batch batch;
char *ins_data;
- int i;
int dst_index;
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
const u64 i_size = i_size_read(&inode->vfs_inode);
@@ -4329,14 +4414,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* while the other is holding the delayed node's mutex and wants to
* write lock the same subvolume leaf for flushing delayed items.
*/
- src = btrfs_clone_extent_buffer(src_path->nodes[0]);
- if (!src)
- return -ENOMEM;
+ ret = clone_leaf(src_path, ctx);
+ if (ret < 0)
+ return ret;
- i = src_path->slots[0];
- btrfs_release_path(src_path);
- src_path->nodes[0] = src;
- src_path->slots[0] = i;
+ src = src_path->nodes[0];
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
@@ -4351,7 +4433,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
batch.nr = 0;
dst_index = 0;
- for (i = 0; i < nr; i++) {
+ for (int i = 0; i < nr; i++) {
const int src_slot = start_slot + i;
struct btrfs_root *csum_root;
struct btrfs_ordered_sum *sums;
@@ -4426,9 +4508,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
disk_bytenr += extent_offset;
ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
disk_bytenr + extent_num_bytes - 1,
- &ordered_sums, 0, false);
- if (ret)
+ &ordered_sums, false);
+ if (ret < 0)
goto out;
+ ret = 0;
list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
if (!ret)
@@ -4458,7 +4541,7 @@ add_to_batch:
goto out;
dst_index = 0;
- for (i = 0; i < nr; i++) {
+ for (int i = 0; i < nr; i++) {
const int src_slot = start_slot + i;
const int dst_slot = dst_path->slots[0] + dst_index;
struct btrfs_key key;
@@ -4538,16 +4621,17 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
{
struct btrfs_ordered_extent *ordered;
struct btrfs_root *csum_root;
+ u64 block_start;
u64 csum_offset;
u64 csum_len;
- u64 mod_start = em->mod_start;
- u64 mod_len = em->mod_len;
+ u64 mod_start = em->start;
+ u64 mod_len = em->len;
LIST_HEAD(ordered_sums);
int ret = 0;
if (inode->flags & BTRFS_INODE_NODATASUM ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
- em->block_start == EXTENT_MAP_HOLE)
+ (em->flags & EXTENT_FLAG_PREALLOC) ||
+ em->disk_bytenr == EXTENT_MAP_HOLE)
return 0;
list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
@@ -4609,21 +4693,23 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (em->compress_type) {
+ if (extent_map_is_compressed(em)) {
csum_offset = 0;
- csum_len = max(em->block_len, em->orig_block_len);
+ csum_len = em->disk_num_bytes;
} else {
csum_offset = mod_start - em->start;
csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
- csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
- ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
- em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0, false);
- if (ret)
+ block_start = extent_map_block_start(em);
+ csum_root = btrfs_csum_root(trans->fs_info, block_start);
+ ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
+ block_start + csum_offset + csum_len - 1,
+ &ordered_sums, false);
+ if (ret < 0)
return ret;
+ ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
@@ -4649,30 +4735,32 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
struct btrfs_key key;
- u64 extent_offset = em->start - em->orig_start;
+ enum btrfs_compression_type compress_type;
+ u64 extent_offset = em->offset;
+ u64 block_start = extent_map_block_start(em);
u64 block_len;
int ret;
btrfs_set_stack_file_extent_generation(&fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
else
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
+ block_len = em->disk_num_bytes;
+ compress_type = extent_map_compression(em);
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
- extent_offset);
+ } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
}
btrfs_set_stack_file_extent_offset(&fi, extent_offset);
btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
- btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+ btrfs_set_stack_file_extent_compression(&fi, compress_type);
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
@@ -4729,7 +4817,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
*/
static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key;
@@ -4795,7 +4884,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
goto out;
ins_nr = 0;
@@ -4850,7 +4939,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
}
if (ins_nr > 0)
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
out:
btrfs_release_path(path);
btrfs_free_path(dst_path);
@@ -4890,13 +4979,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
continue;
/* We log prealloc extents beyond eof later. */
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ if ((em->flags & EXTENT_FLAG_PREALLOC) &&
em->start >= i_size_read(&inode->vfs_inode))
continue;
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
- set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags |= EXTENT_FLAG_LOGGING;
list_add_tail(&em->list, &extents);
num++;
}
@@ -4913,7 +5002,7 @@ process:
* private list.
*/
if (ret) {
- clear_em_logging(tree, em);
+ clear_em_logging(inode, em);
free_extent_map(em);
continue;
}
@@ -4922,14 +5011,14 @@ process:
ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
- clear_em_logging(tree, em);
+ clear_em_logging(inode, em);
free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
if (!ret)
- ret = btrfs_log_prealloc_extents(trans, inode, path);
+ ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
if (ret)
return ret;
@@ -4945,12 +5034,12 @@ process:
set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
- spin_lock_irq(&inode->ordered_tree.lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
atomic_inc(&trans->transaction->pending_ordered);
}
- spin_unlock_irq(&inode->ordered_tree.lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
btrfs_put_ordered_extent(ordered);
}
@@ -5010,7 +5099,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
int ret;
@@ -5039,7 +5129,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
if (slot >= nritems) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5065,7 +5155,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
}
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
return ret;
}
@@ -5419,7 +5509,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
ihold(&curr_inode->vfs_inode);
while (true) {
- struct inode *vfs_inode;
struct btrfs_key key;
struct btrfs_key found_key;
u64 next_index;
@@ -5435,7 +5524,7 @@ again:
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dir_item *di;
struct btrfs_key di_key;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
int log_mode = LOG_INODE_EXISTS;
int type;
@@ -5462,17 +5551,16 @@ again:
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
break;
}
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto out;
if (ctx->log_new_dentries) {
@@ -5514,14 +5602,13 @@ again:
kfree(dir_elem);
btrfs_add_delayed_iput(curr_inode);
- curr_inode = NULL;
- vfs_inode = btrfs_iget_logging(ino, root);
- if (IS_ERR(vfs_inode)) {
- ret = PTR_ERR(vfs_inode);
+ curr_inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(curr_inode)) {
+ ret = PTR_ERR(curr_inode);
+ curr_inode = NULL;
break;
}
- curr_inode = BTRFS_I(vfs_inode);
}
out:
btrfs_free_path(path);
@@ -5599,7 +5686,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* It's rare to have a lot of conflicting inodes, in practice it is not
@@ -5690,12 +5777,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* inode in LOG_INODE_EXISTS mode and rename operations update the log,
* so that the log ends up with the new name and without the old name.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
return 0;
}
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
@@ -5731,7 +5818,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ctx->conflict_inodes)) {
struct btrfs_ino_list *curr;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
u64 parent;
@@ -5767,9 +5854,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* dir index key range logged for the directory. So we
* must make sure the deletion is recorded.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_ALL, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
continue;
@@ -5785,8 +5871,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* it again because if some other task logged the inode after
* that, we can avoid doing it again.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5797,8 +5883,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
}
@@ -5866,7 +5952,7 @@ again:
if (ret < 0) {
return ret;
} else if (ret > 0 &&
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ other_ino != btrfs_ino(ctx->inode)) {
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -5875,7 +5961,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr,
- inode_only, logged_isize);
+ inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5894,7 +5980,7 @@ again:
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5911,7 +5997,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 1;
@@ -5926,7 +6012,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
- logged_isize);
+ logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5951,7 +6037,7 @@ next_key:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret)
return ret;
}
@@ -5962,7 +6048,7 @@ next_key:
* lock the same leaf with btrfs_log_prealloc_extents() below.
*/
btrfs_release_path(path);
- ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
}
return ret;
@@ -6290,7 +6376,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
list_for_each_entry(item, delayed_ins_list, log_list) {
struct btrfs_dir_item *dir_item;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
struct btrfs_key key;
int log_mode = LOG_INODE_EXISTS;
@@ -6306,8 +6392,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
break;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
continue;
}
@@ -6315,12 +6401,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
log_mode = LOG_INODE_ALL;
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+ ret = log_new_dir_dentries(trans, di_inode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ btrfs_add_delayed_iput(di_inode);
if (ret)
break;
@@ -6553,7 +6639,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
btrfs_release_path(dst_path);
- ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
xattrs_logged = true;
@@ -6580,7 +6666,7 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
btrfs_release_path(path);
@@ -6728,7 +6814,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
- struct inode *dir_inode;
+ struct btrfs_inode *dir_inode;
inode_key.type = BTRFS_INODE_ITEM_KEY;
inode_key.offset = 0;
@@ -6777,18 +6863,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ if (!need_log_inode(trans, dir_inode)) {
+ btrfs_add_delayed_iput(dir_inode);
continue;
}
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
- LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans,
- BTRFS_I(dir_inode), ctx);
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ ret = log_new_dir_dentries(trans, dir_inode, ctx);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -6813,7 +6897,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
int ret = 0;
@@ -6828,11 +6912,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation >= trans->transid &&
- need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode))
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -7013,6 +7096,15 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
}
/*
+ * If we're logging an inode from a subvolume created in the current
+ * transaction we must force a commit since the root is not persisted.
+ */
+ if (btrfs_root_generation(&root->root_item) == trans->transid) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto end_no_trans;
+ }
+
+ /*
* Skip already logged inodes or inodes corresponding to tmpfiles
* (since logging them is pointless, a link count of 0 means they
* will never be accessible).
@@ -7222,9 +7314,7 @@ again:
* each subsequent pass.
*/
if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans,
- log->node->start,
- log->node->len);
+ ret = btrfs_pin_extent_for_log_replay(trans, log->node);
btrfs_put_root(log);
if (!ret)
@@ -7235,11 +7325,14 @@ again:
wc.replay_dest->log_root = log;
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
+ if (ret) {
/* The loop needs to continue due to the root refs */
btrfs_abort_transaction(trans, ret);
- else
+ } else {
ret = walk_log_tree(trans, log, &wc);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
@@ -7395,6 +7488,26 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
}
/*
+ * Call this when creating a subvolume in a directory.
+ * Because we don't commit a transaction when creating a subvolume, we can't
+ * allow the directory pointing to the subvolume to be logged with an entry that
+ * points to an unpersisted root if we are still in the transaction used to
+ * create the subvolume, so make any attempt to log the directory to result in a
+ * full log sync.
+ * Also we don't need to worry with renames, since btrfs_rename() marks the log
+ * for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
+ */
+void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir)
+{
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
+}
+
+/*
* Update the log after adding a new name for an inode.
*
* @trans: Transaction handle.
@@ -7526,8 +7639,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
goto out;
}
- btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ btrfs_init_log_ctx(&ctx, inode);
ctx.logging_new_name = true;
+ btrfs_init_log_ctx_scratch_eb(&ctx);
/*
* We don't care about the return value. If we fail to log the new name
* then we know the next attempt to sync the log will fallback to a full
@@ -7536,6 +7650,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index a550a8a375cd..dc313e6bb2fa 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -6,10 +6,18 @@
#ifndef BTRFS_TREE_LOG_H
#define BTRFS_TREE_LOG_H
+#include <linux/list.h>
+#include <linux/fs.h>
#include "messages.h"
#include "ctree.h"
#include "transaction.h"
+struct inode;
+struct dentry;
+struct btrfs_ordered_extent;
+struct btrfs_root;
+struct btrfs_trans_handle;
+
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
@@ -29,44 +37,27 @@ struct btrfs_log_ctx {
bool logging_new_delayed_dentries;
/* Indicate if the inode being logged was logged before. */
bool logged_before;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct list_head list;
/* Only used for fast fsyncs. */
struct list_head ordered_extents;
struct list_head conflict_inodes;
int num_conflict_inodes;
bool logging_conflict_inodes;
+ /*
+ * Used for fsyncs that need to copy items from the subvolume tree to
+ * the log tree (full sync flag set or copy everything flag set) to
+ * avoid allocating a temporary extent buffer while holding a lock on
+ * an extent buffer of the subvolume tree and under the log transaction.
+ * Also helps to avoid allocating and freeing a temporary extent buffer
+ * in case we need to process multiple leaves from the subvolume tree.
+ */
+ struct extent_buffer *scratch_eb;
};
-static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
- struct inode *inode)
-{
- ctx->log_ret = 0;
- ctx->log_transid = 0;
- ctx->log_new_dentries = false;
- ctx->logging_new_name = false;
- ctx->logging_new_delayed_dentries = false;
- ctx->logged_before = false;
- ctx->inode = inode;
- INIT_LIST_HEAD(&ctx->list);
- INIT_LIST_HEAD(&ctx->ordered_extents);
- INIT_LIST_HEAD(&ctx->conflict_inodes);
- ctx->num_conflict_inodes = 0;
- ctx->logging_conflict_inodes = false;
-}
-
-static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
-{
- struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_extent *tmp;
-
- ASSERT(inode_is_locked(ctx->inode));
-
- list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
- list_del_init(&ordered->log_list);
- btrfs_put_ordered_extent(ordered);
- }
-}
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode);
+void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx);
+void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx);
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
{
@@ -103,6 +94,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
bool for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
+void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir);
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct dentry *old_dentry, struct btrfs_inode *old_dir,
u64 old_dir_index, struct dentry *parent);
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 3df6153d5d5a..b382a4c443d4 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -44,7 +44,7 @@ struct tree_mod_elem {
/*
* Pull a new tree mod seq number for our operation.
*/
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
@@ -170,8 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
* this until all tree mod log insertions are recorded in the rb tree and then
* write unlock fs_info::tree_mod_log_lock.
*/
-static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
@@ -188,8 +187,8 @@ static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
}
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
-static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return false;
@@ -199,7 +198,7 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
return true;
}
-static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
+static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
int slot,
enum btrfs_mod_log_op op)
{
@@ -222,7 +221,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
return tm;
}
-int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op)
{
struct tree_mod_elem *tm;
@@ -259,7 +258,7 @@ out_unlock:
return ret;
}
-static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
+static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items)
{
@@ -279,7 +278,7 @@ static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
return tm;
}
-int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items)
{
@@ -367,9 +366,9 @@ free_tms:
return ret;
}
-static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct tree_mod_elem **tm_list,
- int nritems)
+static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
{
int i, j;
int ret;
@@ -536,7 +535,7 @@ static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
}
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
- struct extent_buffer *src,
+ const struct extent_buffer *src,
unsigned long dst_offset,
unsigned long src_offset,
int nr_items)
@@ -1005,7 +1004,7 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
check.level = level;
- check.owner_root = root->root_key.objectid;
+ check.owner_root = btrfs_root_id(root);
old = read_tree_block(fs_info, logical, &check);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 94f10afeee97..6308c577a4a4 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -3,7 +3,13 @@
#ifndef BTRFS_TREE_MOD_LOG_H
#define BTRFS_TREE_MOD_LOG_H
-#include "ctree.h"
+#include <linux/list.h>
+
+struct extent_buffer;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_root;
+struct btrfs_seq_list;
/* Represents a tree mod log user. */
struct btrfs_seq_list {
@@ -31,7 +37,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
struct extent_buffer *new_root,
bool log_removal);
-int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
@@ -41,11 +47,11 @@ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
- struct extent_buffer *src,
+ const struct extent_buffer *src,
unsigned long dst_offset,
unsigned long src_offset,
int nr_items);
-int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items);
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 33606025513d..fc59b57257d6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -7,7 +7,6 @@
#include <linux/slab.h>
#include "messages.h"
#include "ulist.h"
-#include "ctree.h"
/*
* ulist is a generic data structure to hold a collection of unique u64
@@ -51,6 +50,7 @@ void ulist_init(struct ulist *ulist)
INIT_LIST_HEAD(&ulist->nodes);
ulist->root = RB_ROOT;
ulist->nnodes = 0;
+ ulist->prealloc = NULL;
}
/*
@@ -69,6 +69,8 @@ void ulist_release(struct ulist *ulist)
list_for_each_entry_safe(node, next, &ulist->nodes, list) {
kfree(node);
}
+ kfree(ulist->prealloc);
+ ulist->prealloc = NULL;
ulist->root = RB_ROOT;
INIT_LIST_HEAD(&ulist->nodes);
}
@@ -106,6 +108,12 @@ struct ulist *ulist_alloc(gfp_t gfp_mask)
return ulist;
}
+void ulist_prealloc(struct ulist *ulist, gfp_t gfp_mask)
+{
+ if (!ulist->prealloc)
+ ulist->prealloc = kzalloc(sizeof(*ulist->prealloc), gfp_mask);
+}
+
/*
* Free dynamically allocated ulist.
*
@@ -207,9 +215,15 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
*old_aux = node->aux;
return 0;
}
- node = kmalloc(sizeof(*node), gfp_mask);
- if (!node)
- return -ENOMEM;
+
+ if (ulist->prealloc) {
+ node = ulist->prealloc;
+ ulist->prealloc = NULL;
+ } else {
+ node = kmalloc(sizeof(*node), gfp_mask);
+ if (!node)
+ return -ENOMEM;
+ }
node->val = val;
node->aux = aux;
@@ -223,7 +237,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
}
/*
- * ulist_del - delete one node from ulist
+ * Delete one node from ulist.
+ *
* @ulist: ulist to remove node from
* @val: value to delete
* @aux: aux to delete
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index b2cef187ea8e..c62a372f1462 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -7,6 +7,7 @@
#ifndef BTRFS_ULIST_H
#define BTRFS_ULIST_H
+#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
@@ -40,12 +41,14 @@ struct ulist {
struct list_head nodes;
struct rb_root root;
+ struct ulist_node *prealloc;
};
void ulist_init(struct ulist *ulist);
void ulist_release(struct ulist *ulist);
void ulist_reinit(struct ulist *ulist);
struct ulist *ulist_alloc(gfp_t gfp_mask);
+void ulist_prealloc(struct ulist *ulist, gfp_t mask);
void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 5be74f9e47eb..aca2861f2187 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -3,18 +3,19 @@
* Copyright (C) STRATO AG 2013. All rights reserved.
*/
+#include <linux/kthread.h>
#include <linux/uuid.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "messages.h"
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "fs.h"
#include "accessors.h"
#include "uuid-tree.h"
+#include "ioctl.h"
-static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
+static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key)
{
key->type = type;
key->objectid = get_unaligned_le64(uuid);
@@ -22,7 +23,7 @@ static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
}
/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */
-static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
u8 type, u64 subid)
{
int ret;
@@ -82,7 +83,7 @@ out:
return ret;
}
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid_cpu)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -114,7 +115,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
sizeof(subid_le));
- if (ret >= 0) {
+ if (ret == 0) {
/* Add an item for the type for the first time */
eb = path->nodes[0];
slot = path->slots[0];
@@ -146,7 +147,7 @@ out:
return ret;
}
-int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -257,7 +258,7 @@ out:
* < 0 if an error occurred
*/
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
- u8 *uuid, u8 type, u64 subvolid)
+ const u8 *uuid, u8 type, u64 subvolid)
{
int ret = 0;
struct btrfs_root *subvol_root;
@@ -391,3 +392,180 @@ out:
btrfs_free_path(path);
return ret;
}
+
+int btrfs_uuid_scan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_root_item root_item;
+ u32 item_size;
+ struct btrfs_trans_handle *trans = NULL;
+ bool closing = false;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ closing = true;
+ break;
+ }
+ ret = btrfs_search_forward(root, &key, path,
+ BTRFS_OLDEST_GENERATION);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+ if (key.type != BTRFS_ROOT_ITEM_KEY ||
+ (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.objectid != BTRFS_FS_TREE_OBJECTID) ||
+ key.objectid > BTRFS_LAST_FREE_OBJECTID)
+ goto skip;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size(eb, slot);
+ if (item_size < sizeof(root_item))
+ goto skip;
+
+ read_extent_buffer(eb, &root_item,
+ btrfs_item_ptr_offset(eb, slot),
+ (int)sizeof(root_item));
+ if (btrfs_root_refs(&root_item) == 0)
+ goto skip;
+
+ if (!btrfs_is_empty_uuid(root_item.uuid) ||
+ !btrfs_is_empty_uuid(root_item.received_uuid)) {
+ if (trans)
+ goto update_tree;
+
+ btrfs_release_path(path);
+ /*
+ * 1 - subvol uuid item
+ * 1 - received_subvol uuid item
+ */
+ trans = btrfs_start_transaction(fs_info->uuid_root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ continue;
+ } else {
+ goto skip;
+ }
+update_tree:
+ btrfs_release_path(path);
+ if (!btrfs_is_empty_uuid(root_item.uuid)) {
+ ret = btrfs_uuid_tree_add(trans, root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+ if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_add(trans,
+ root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+skip:
+ btrfs_release_path(path);
+ if (trans) {
+ ret = btrfs_end_transaction(trans);
+ trans = NULL;
+ if (ret)
+ break;
+ }
+
+ if (key.offset < (u64)-1) {
+ key.offset++;
+ } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ } else if (key.objectid < (u64)-1) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.objectid++;
+ } else {
+ break;
+ }
+ cond_resched();
+ }
+
+out:
+ btrfs_free_path(path);
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans);
+ if (ret)
+ btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
+ else if (!closing)
+ set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
+ up(&fs_info->uuid_tree_rescan_sem);
+ return 0;
+}
+
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *uuid_root;
+ struct task_struct *task;
+ int ret;
+
+ /*
+ * 1 - root node
+ * 1 - root item
+ */
+ trans = btrfs_start_transaction(tree_root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
+ if (IS_ERR(uuid_root)) {
+ ret = PTR_ERR(uuid_root);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ fs_info->uuid_root = uuid_root;
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error case */
+ btrfs_warn(fs_info, "failed to start uuid_scan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h
index 5350c87fe2ca..c60ad20325cc 100644
--- a/fs/btrfs/uuid-tree.h
+++ b/fs/btrfs/uuid-tree.h
@@ -3,10 +3,17 @@
#ifndef BTRFS_UUID_TREE_H
#define BTRFS_UUID_TREE_H
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
+
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid);
-int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_uuid_scan_kthread(void *data);
#endif
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 744f4f4d4c68..e97ad824ae16 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -14,7 +14,6 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
-#include "disk-io.h"
#include "locking.h"
#include "fs.h"
#include "accessors.h"
@@ -285,7 +284,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
* page and ignore dest, but it must still be non-NULL to avoid the
* counting-only behavior.
* @len: length in bytes to read
- * @dest_page: copy into this page instead of the dest buffer
+ * @dest_folio: copy into this folio instead of the dest buffer
*
* Helper function to read items from the btree. This returns the number of
* bytes read or < 0 for errors. We can return short reads if the items don't
@@ -295,7 +294,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
* Returns number of bytes read or a negative error code on failure.
*/
static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
- char *dest, u64 len, struct page *dest_page)
+ char *dest, u64 len, struct folio *dest_folio)
{
struct btrfs_path *path;
struct btrfs_root *root = inode->root;
@@ -315,7 +314,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
if (!path)
return -ENOMEM;
- if (dest_page)
+ if (dest_folio)
path->reada = READA_FORWARD;
key.objectid = btrfs_ino(inode);
@@ -372,15 +371,15 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
copy_offset = offset - key.offset;
if (dest) {
- if (dest_page)
- kaddr = kmap_local_page(dest_page);
+ if (dest_folio)
+ kaddr = kmap_local_folio(dest_folio, 0);
data = btrfs_item_ptr(leaf, path->slots[0], void);
read_extent_buffer(leaf, kaddr + dest_offset,
(unsigned long)data + copy_offset,
copy_bytes);
- if (dest_page)
+ if (dest_folio)
kunmap_local(kaddr);
}
@@ -461,7 +460,7 @@ static int rollback_verity(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
int ret;
- ASSERT(inode_is_locked(&inode->vfs_inode));
+ btrfs_assert_inode_locked(inode);
truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
ret = btrfs_drop_verity_items(inode);
@@ -487,7 +486,7 @@ static int rollback_verity(struct btrfs_inode *inode)
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -554,7 +553,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto end_trans;
ret = del_orphan(trans, inode);
@@ -586,7 +585,7 @@ static int btrfs_begin_enable_verity(struct file *filp)
struct btrfs_trans_handle *trans;
int ret;
- ASSERT(inode_is_locked(file_inode(filp)));
+ btrfs_assert_inode_locked(inode);
if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
return -EBUSY;
@@ -634,7 +633,7 @@ static int btrfs_end_enable_verity(struct file *filp, const void *desc,
int ret = 0;
int rollback_ret;
- ASSERT(inode_is_locked(file_inode(filp)));
+ btrfs_assert_inode_locked(inode);
if (desc == NULL)
goto rollback;
@@ -763,7 +762,7 @@ again:
* [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
*/
ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
- folio_address(folio), PAGE_SIZE, &folio->page);
+ folio_address(folio), PAGE_SIZE, folio);
if (ret < 0) {
folio_put(folio);
return ERR_PTR(ret);
diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h
index 91c10f7d0a46..d696659e43e4 100644
--- a/fs/btrfs/verity.h
+++ b/fs/btrfs/verity.h
@@ -3,8 +3,13 @@
#ifndef BTRFS_VERITY_H
#define BTRFS_VERITY_H
+struct inode;
+struct btrfs_inode;
+
#ifdef CONFIG_FS_VERITY
+#include <linux/fsverity.h>
+
extern const struct fsverity_operations btrfs_verityops;
int btrfs_drop_verity_items(struct btrfs_inode *inode);
@@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
#else
+#include <linux/errno.h>
+
static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
{
return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9a0b26d08e1..58e0cac5779d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,10 +14,8 @@
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
-#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "rcu-string.h"
@@ -35,11 +33,23 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "raid-stripe-tree.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
BTRFS_BLOCK_GROUP_RAID56_MASK)
+struct btrfs_io_geometry {
+ u32 stripe_index;
+ u32 stripe_nr;
+ int mirror_num;
+ int num_stripes;
+ u64 stripe_offset;
+ u64 raid56_full_stripe_start;
+ int max_errors;
+ enum btrfs_map_op op;
+};
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -357,21 +367,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
}
/*
- * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the UUID to fs_devices::fsid
- * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
+ * Allocate new btrfs_fs_devices structure identified by a fsid.
+ *
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to
+ * fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
- const u8 *metadata_fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
struct btrfs_fs_devices *fs_devs;
- ASSERT(fsid || !metadata_fsid);
-
fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
@@ -385,8 +393,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
if (fsid) {
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
- memcpy(fs_devs->metadata_uuid,
- metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
}
return fs_devs;
@@ -457,91 +464,46 @@ static noinline struct btrfs_fs_devices *find_fsid(
return NULL;
}
-/*
- * First check if the metadata_uuid is different from the fsid in the given
- * fs_devices. Then check if the given fsid is the same as the metadata_uuid
- * in the fs_devices. If it is, return true; otherwise, return false.
- */
-static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
-{
- return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
-}
-
-static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
- struct btrfs_super_block *disk_super)
-{
-
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by first scanning
- * a device which didn't have its fsid/metadata_uuid changed
- * at all and the CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
- fs_devices->fsid))
- return fs_devices;
- }
-
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by a device that
- * has an outdated pair of fsid/metadata_uuid and
- * CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
- return fs_devices;
- }
-
- return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
-}
-
-
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct block_device **bdev,
+ int flush, struct file **bdev_file,
struct btrfs_super_block **disk_super)
{
+ struct block_device *bdev;
int ret;
- *bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev)) {
- ret = PTR_ERR(*bdev);
+ if (IS_ERR(*bdev_file)) {
+ ret = PTR_ERR(*bdev_file);
+ btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d",
+ device_path, flags, ret);
goto error;
}
+ bdev = file_bdev(*bdev_file);
if (flush)
- sync_blockdev(*bdev);
- ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
- if (ret) {
- blkdev_put(*bdev, holder);
- goto error;
+ sync_blockdev(bdev);
+ if (holder) {
+ ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
+ if (ret) {
+ fput(*bdev_file);
+ goto error;
+ }
}
- invalidate_bdev(*bdev);
- *disk_super = btrfs_read_dev_super(*bdev);
+ invalidate_bdev(bdev);
+ *disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- blkdev_put(*bdev, holder);
+ fput(*bdev_file);
goto error;
}
return 0;
error:
- *bdev = NULL;
+ *disk_super = NULL;
+ *bdev_file = NULL;
return ret;
}
@@ -562,13 +524,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
- int ret = 0;
+ int ret;
+ bool freed = false;
lockdep_assert_held(&uuid_mutex);
- if (devt)
- ret = -ENOENT;
-
+ /* Return good status if there is no instance of devt. */
+ ret = 0;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
mutex_lock(&fs_devices->device_list_mutex);
@@ -579,8 +541,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
if (devt && devt != device->devt)
continue;
if (fs_devices->opened) {
- /* for an already deleted device return 0 */
- if (devt && ret != 0)
+ if (devt)
ret = -EBUSY;
break;
}
@@ -590,7 +551,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
list_del(&device->dev_list);
btrfs_free_device(device);
- ret = 0;
+ freed = true;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -601,9 +562,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
}
}
+ /* If there is at least one freed device return 0. */
+ if (freed)
+ return 0;
+
return ret;
}
+static struct btrfs_fs_devices *find_fsid_by_device(
+ struct btrfs_super_block *disk_super,
+ dev_t devt, bool *same_fsid_diff_dev)
+{
+ struct btrfs_fs_devices *fsid_fs_devices;
+ struct btrfs_fs_devices *devt_fs_devices;
+ const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+ bool found_by_devt = false;
+
+ /* Find the fs_device by the usual method, if found use it. */
+ fsid_fs_devices = find_fsid(disk_super->fsid,
+ has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+
+ /* The temp_fsid feature is supported only with single device filesystem. */
+ if (btrfs_super_num_devices(disk_super) != 1)
+ return fsid_fs_devices;
+
+ /*
+ * A seed device is an integral component of the sprout device, which
+ * functions as a multi-device filesystem. So, temp-fsid feature is
+ * not supported.
+ */
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
+ return fsid_fs_devices;
+
+ /* Try to find a fs_devices by matching devt. */
+ list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
+ if (device->devt == devt) {
+ found_by_devt = true;
+ break;
+ }
+ }
+ if (found_by_devt)
+ break;
+ }
+
+ if (found_by_devt) {
+ /* Existing device. */
+ if (fsid_fs_devices == NULL) {
+ if (devt_fs_devices->opened == 0) {
+ /* Stale device. */
+ return NULL;
+ } else {
+ /* temp_fsid is mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* Regular or temp_fsid device mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* New device. */
+ if (fsid_fs_devices == NULL) {
+ return NULL;
+ } else {
+ /* sb::fsid is already used create a new temp_fsid. */
+ *same_fsid_diff_dev = true;
+ return NULL;
+ }
+ }
+
+ /* Not reached. */
+}
+
/*
* This is only used on mount, and we are protected from competing things
* messing with our fs_devices by the uuid_mutex, thus we do not need the
@@ -613,7 +646,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct block_device *bdev;
+ struct file *bdev_file;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -624,7 +657,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &disk_super);
+ &bdev_file, &disk_super);
if (ret)
return ret;
@@ -648,21 +681,31 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev))
+ if (bdev_read_only(file_bdev(bdev_file)))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev))
+ if (!bdev_nonrot(file_bdev(bdev_file)))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev))
+ if (bdev_max_discard_sectors(file_bdev(bdev_file)))
fs_devices->discardable = true;
- device->bdev = bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- device->holder = holder;
+
+ if (device->devt != device->bdev->bd_dev) {
+ btrfs_warn(NULL,
+ "device %s maj:min changed from %d:%d to %d:%d",
+ device->name->str, MAJOR(device->devt),
+ MINOR(device->devt), MAJOR(device->bdev->bd_dev),
+ MINOR(device->bdev->bd_dev));
+
+ device->devt = device->bdev->bd_dev;
+ }
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
@@ -676,12 +719,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, holder);
+ fput(bdev_file);
return -EINVAL;
}
-u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
{
bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
@@ -726,84 +769,6 @@ out:
}
/*
- * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
- * being created with a disk that has already completed its fsid change. Such
- * disk can belong to an fs which has its FSID changed or to one which doesn't.
- * Handle both cases here.
- */
-static struct btrfs_fs_devices *find_fsid_inprogress(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->fsid))
- return fs_devices;
- }
-
- return find_fsid(disk_super->fsid, NULL);
-}
-
-static struct btrfs_fs_devices *find_fsid_changed(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handles the case where scanned device is part of an fs that had
- * multiple successful changes of FSID but currently device didn't
- * observe it. Meaning our fsid will be different than theirs. We need
- * to handle two subcases :
- * 1 - The fs still continues to have different METADATA/FSID uuids.
- * 2 - The fs is switched back to its original FSID (METADATA/FSID
- * are equal).
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- /* Changed UUIDs */
- if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
- memcmp(fs_devices->fsid, disk_super->fsid,
- BTRFS_FSID_SIZE) != 0)
- return fs_devices;
-
- /* Unchanged UUIDs */
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, disk_super->metadata_uuid,
- BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- }
-
- return NULL;
-}
-
-static struct btrfs_fs_devices *find_fsid_reverted_metadata(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handle the case where the scanned device is part of an fs whose last
- * metadata UUID change reverted it to the original FSID. At the same
- * time fs_devices was first created by another constituent device
- * which didn't fully observe the operation. This results in an
- * btrfs_fs_devices created with metadata/fsid different AND
- * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
- * fs_devices equal to the FSID of the disk.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->fsid))
- return fs_devices;
- }
-
- return NULL;
-}
-/*
* Add new device to list of registered devices
*
* Returns:
@@ -821,10 +786,16 @@ static noinline struct btrfs_device *device_list_add(const char *path,
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_t path_devt;
int error;
+ bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
- bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
- BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+ btrfs_err(NULL,
+"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
+ path);
+ return ERR_PTR(-EAGAIN);
+ }
error = lookup_bdev(path, &path_devt);
if (error) {
@@ -833,27 +804,24 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(error);
}
- if (fsid_change_in_progress) {
- if (!has_metadata_uuid)
- fs_devices = find_fsid_inprogress(disk_super);
- else
- fs_devices = find_fsid_changed(disk_super);
- } else if (has_metadata_uuid) {
- fs_devices = find_fsid_with_metadata_uuid(disk_super);
- } else {
- fs_devices = find_fsid_reverted_metadata(disk_super);
- if (!fs_devices)
- fs_devices = find_fsid(disk_super->fsid, NULL);
- }
-
+ fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
if (!fs_devices) {
- fs_devices = alloc_fs_devices(disk_super->fsid,
- has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+ fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
- fs_devices->fsid_change = fsid_change_in_progress;
+ if (has_metadata_uuid)
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+ if (same_fsid_diff_dev) {
+ generate_random_uuid(fs_devices->fsid);
+ fs_devices->temp_fsid = true;
+ pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
+ path, MAJOR(path_devt), MINOR(path_devt),
+ fs_devices->fsid);
+ }
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
@@ -868,18 +836,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, &args);
- /*
- * If this disk has been pulled into an fs devices created by
- * a device which had the CHANGING_FSID_V2 flag then replace the
- * metadata_uuid/fsid values of the fs_devices.
- */
- if (fs_devices->fsid_change &&
- found_transid > fs_devices->latest_generation) {
+ if (found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
memcpy(fs_devices->metadata_uuid,
btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
- fs_devices->fsid_change = false;
}
}
@@ -888,8 +849,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (fs_devices->opened) {
btrfs_err(NULL,
-"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
- path, fs_devices->fsid, current->comm,
+"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
+ path, MAJOR(path_devt), MINOR(path_devt),
+ fs_devices->fsid, current->comm,
task_pid_nr(current));
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
@@ -915,13 +877,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (disk_super->label[0])
pr_info(
- "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
disk_super->label, devid, found_transid, path,
+ MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
else
pr_info(
- "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
disk_super->fsid, devid, found_transid, path,
+ MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
} else if (!device->name || !is_same_device(device, path)) {
@@ -1033,7 +997,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
lockdep_assert_held(&uuid_mutex);
- fs_devices = alloc_fs_devices(orig->fsid, NULL);
+ fs_devices = alloc_fs_devices(orig->fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -1104,9 +1068,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev) {
- blkdev_put(device->bdev, device->holder);
+ if (device->bdev_file) {
+ fput(device->bdev_file);
device->bdev = NULL;
+ device->bdev_file = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1151,7 +1116,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- blkdev_put(device->bdev, device->holder);
+ fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1176,6 +1141,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
@@ -1363,7 +1329,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
return ERR_PTR(-EINVAL);
/* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
+ page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -1396,30 +1362,70 @@ int btrfs_forget_devices(dev_t devt)
return ret;
}
+static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
+ const char *path, dev_t devt,
+ bool mount_arg_dev)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Do not skip device registration for mounted devices with matching
+ * maj:min but different paths. Booting without initrd relies on
+ * /dev/root initially, later replaced with the actual root device.
+ * A successful scan ensures grub2-probe selects the correct device.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ if (!fs_devices->opened) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ continue;
+ }
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->bdev && (device->bdev->bd_dev == devt) &&
+ strcmp(device->name->str, path) != 0) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /* Do not skip registration. */
+ return false;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+ }
+
+ if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+ !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
+ return true;
+
+ return false;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
- * is read via pagecache
+ * is read via pagecache.
+ *
+ * With @mount_arg_dev it's a scan during mount time that will always register
+ * the device or return an error. Multi-device and seeding devices are registered
+ * in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+ bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct block_device *bdev;
- u64 bytenr, bytenr_orig;
+ struct file *bdev_file;
+ u64 bytenr;
+ dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
/*
- * we would like to check all the supers, but that would make
- * a btrfs mount succeed after a mkfs from a different FS.
- * So, we need to add a special mount option to scan for
- * later supers, using BTRFS_SUPER_MIRROR_MAX instead
- */
-
- /*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
* resulting in failure.
@@ -1429,31 +1435,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev = blkdev_get_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
- bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ /*
+ * We would like to check all the super blocks, but doing so would
+ * allow a mount to succeed after a mkfs from a different filesystem.
+ * Currently, recovery from a bad primary btrfs superblock is done
+ * using the userspace command 'btrfs check --super'.
+ */
+ ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
+ btrfs_sb_offset(0));
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
+ devt = file_bdev(bdev_file)->bd_dev;
+ if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
+ pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+ path, MAJOR(devt), MINOR(devt));
+
+ btrfs_free_stale_devices(devt, NULL);
+
+ device = NULL;
+ goto free_disk_super;
+ }
+
device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
+free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- blkdev_put(bdev, NULL);
+ fput(bdev_file);
return device;
}
@@ -1825,19 +1849,18 @@ out:
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map.rb_root);
+ read_lock(&fs_info->mapping_tree_lock);
+ n = rb_last(&fs_info->mapping_tree.rb_root);
if (n) {
- em = rb_entry(n, struct extent_map, rb_node);
- ret = em->start + em->len;
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(n, struct btrfs_chunk_map, rb_node);
+ ret = map->start + map->chunk_len;
}
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
@@ -2105,11 +2128,10 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
copy_num, ret);
}
-void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
{
int copy_num;
+ struct block_device *bdev = device->bdev;
if (!bdev)
return;
@@ -2125,12 +2147,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
+ update_dev_time(device->name->str);
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, void **holder)
+ struct file **bdev_file)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2239,7 +2261,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev) {
+ if (device->bdev_file) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2255,21 +2277,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and blkdev_put() will pull in the ->open_mutex on the
- * block device and it's dependencies. Instead just flush the device
- * and let the caller do the final blkdev_put.
+ * write lock, and fput() on the block device will pull in the
+ * ->open_mutex on the block device and it's dependencies. Instead
+ * just flush the device and let the caller do the final bdev_release.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
- btrfs_scratch_superblocks(fs_info, device->bdev,
- device->name->str);
+ btrfs_scratch_superblocks(fs_info, device);
if (device->bdev) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
}
- *bdev = device->bdev;
- *holder = device->holder;
+ *bdev_file = device->bdev_file;
synchronize_rcu();
btrfs_free_device(device);
@@ -2375,8 +2395,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
- tgtdev->name->str);
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
@@ -2406,7 +2425,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct block_device *bdev;
+ struct file *bdev_file;
int ret;
if (!path || !path[0])
@@ -2424,7 +2443,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev, &disk_super);
+ &bdev_file, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2437,7 +2456,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, NULL);
+ fput(bdev_file);
return 0;
}
@@ -2494,7 +2513,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
* Private copy of the seed devices, anchored at
* fs_info->fs_devices->seed_list
*/
- seed_devices = alloc_fs_devices(NULL, NULL);
+ seed_devices = alloc_fs_devices(NULL);
if (IS_ERR(seed_devices))
return seed_devices;
@@ -2657,7 +2676,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct block_device *bdev;
+ struct file *bdev_file;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2670,12 +2689,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->bdev_holder, NULL);
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
- if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
ret = -EINVAL;
goto error;
}
@@ -2687,11 +2706,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev);
+ sync_blockdev(file_bdev(bdev_file));
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev) {
+ if (device->bdev == file_bdev(bdev_file)) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2707,7 +2726,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev = bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2728,14 +2748,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes =
- round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
+ round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->holder = fs_info->bdev_holder;
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
/* GFP_KERNEL allocation must not be under device_list_mutex */
@@ -2767,7 +2786,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!bdev_nonrot(bdev))
+ if (!bdev_nonrot(device->bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2887,7 +2906,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- blkdev_put(bdev, fs_info->bdev_holder);
+ fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -2968,6 +2987,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
btrfs_set_super_total_bytes(super_copy,
round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
+ atomic64_add(diff, &fs_info->free_chunk_space);
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
@@ -3004,16 +3024,19 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_handle_fs_error(fs_info, -ENOENT,
- "Failed lookup while freeing chunk.");
- ret = -ENOENT;
+ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
+ chunk_offset);
+ btrfs_abort_transaction(trans, -ENOENT);
+ ret = -EUCLEAN;
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0)
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to delete chunk item.");
+ if (ret < 0) {
+ btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
out:
btrfs_free_path(path);
return ret;
@@ -3065,45 +3088,118 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
}
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev;
+ struct btrfs_chunk_map *map;
+ struct btrfs_chunk_map *prev_map = NULL;
+
+ while (node) {
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ prev = node;
+ prev_map = map;
+
+ if (logical < map->start) {
+ node = node->rb_left;
+ } else if (logical >= map->start + map->chunk_len) {
+ node = node->rb_right;
+ } else {
+ refcount_inc(&map->refs);
+ return map;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ orig_prev = prev;
+ while (prev && logical >= prev_map->start + prev_map->chunk_len) {
+ prev = rb_next(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+
+ if (!prev) {
+ prev = orig_prev;
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ while (prev && logical < prev_map->start) {
+ prev = rb_prev(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+ }
+
+ if (prev) {
+ u64 end = logical + length;
+
+ /*
+ * Caller can pass a U64_MAX length when it wants to get any
+ * chunk starting at an offset of 'logical' or higher, so deal
+ * with underflow by resetting the end offset to U64_MAX.
+ */
+ if (end < logical)
+ end = U64_MAX;
+
+ if (end > prev_map->start &&
+ logical < prev_map->start + prev_map->chunk_len) {
+ refcount_inc(&prev_map->refs);
+ return prev_map;
+ }
+ }
+
+ return NULL;
+}
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_chunk_map *map;
+
+ read_lock(&fs_info->mapping_tree_lock);
+ map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
+ read_unlock(&fs_info->mapping_tree_lock);
+
+ return map;
+}
+
/*
- * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * Find the mapping containing the given logical extent.
+ *
* @logical: Logical block offset in bytes.
* @length: Length of extent in bytes.
*
* Return: Chunk mapping or ERR_PTR.
*/
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, length);
- if (!em) {
+ if (unlikely(!map)) {
btrfs_crit(fs_info,
"unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len <= logical) {
+ if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
btrfs_crit(fs_info,
"found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
- logical, logical + length, em->start, em->start + em->len);
- free_extent_map(em);
+ logical, logical + length, map->start,
+ map->start + map->chunk_len);
+ btrfs_free_chunk_map(map);
return ERR_PTR(-EINVAL);
}
- /* callers are responsible for dropping em's ref. */
- return em;
+ /* Callers are responsible for dropping the reference. */
+ return map;
}
static int remove_chunk_item(struct btrfs_trans_handle *trans,
- struct map_lookup *map, u64 chunk_offset)
+ struct btrfs_chunk_map *map, u64 chunk_offset)
{
int i;
@@ -3128,23 +3224,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_extent_len = 0;
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em)) {
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- return PTR_ERR(em);
+ return PTR_ERR(map);
}
- map = em->map_lookup;
/*
* First delete the device extent items from the devices btree.
@@ -3174,6 +3268,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
@@ -3247,7 +3347,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
goto out;
}
- trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
+ trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
@@ -3266,7 +3366,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
*/
btrfs_trans_release_chunk_metadata(trans);
- ret = btrfs_remove_block_group(trans, chunk_offset, em);
+ ret = btrfs_remove_block_group(trans, map);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3278,7 +3378,7 @@ out:
trans->removing_chunk = false;
}
/* once for us */
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -3489,6 +3589,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
return 0;
}
+static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ const struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+}
+
+static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ const struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+}
+
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
@@ -3633,7 +3771,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info)
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
int ret;
- BUG_ON(!fs_info->balance_ctl);
+ ASSERT(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = NULL;
@@ -4687,183 +4825,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_uuid_scan_kthread(void *data)
-{
- struct btrfs_fs_info *fs_info = data;
- struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_key key;
- struct btrfs_path *path = NULL;
- int ret = 0;
- struct extent_buffer *eb;
- int slot;
- struct btrfs_root_item root_item;
- u32 item_size;
- struct btrfs_trans_handle *trans = NULL;
- bool closing = false;
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- key.objectid = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = 0;
-
- while (1) {
- if (btrfs_fs_closing(fs_info)) {
- closing = true;
- break;
- }
- ret = btrfs_search_forward(root, &key, path,
- BTRFS_OLDEST_GENERATION);
- if (ret) {
- if (ret > 0)
- ret = 0;
- break;
- }
-
- if (key.type != BTRFS_ROOT_ITEM_KEY ||
- (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
- key.objectid != BTRFS_FS_TREE_OBJECTID) ||
- key.objectid > BTRFS_LAST_FREE_OBJECTID)
- goto skip;
-
- eb = path->nodes[0];
- slot = path->slots[0];
- item_size = btrfs_item_size(eb, slot);
- if (item_size < sizeof(root_item))
- goto skip;
-
- read_extent_buffer(eb, &root_item,
- btrfs_item_ptr_offset(eb, slot),
- (int)sizeof(root_item));
- if (btrfs_root_refs(&root_item) == 0)
- goto skip;
-
- if (!btrfs_is_empty_uuid(root_item.uuid) ||
- !btrfs_is_empty_uuid(root_item.received_uuid)) {
- if (trans)
- goto update_tree;
-
- btrfs_release_path(path);
- /*
- * 1 - subvol uuid item
- * 1 - received_subvol uuid item
- */
- trans = btrfs_start_transaction(fs_info->uuid_root, 2);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- continue;
- } else {
- goto skip;
- }
-update_tree:
- btrfs_release_path(path);
- if (!btrfs_is_empty_uuid(root_item.uuid)) {
- ret = btrfs_uuid_tree_add(trans, root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- key.objectid);
- if (ret < 0) {
- btrfs_warn(fs_info, "uuid_tree_add failed %d",
- ret);
- break;
- }
- }
-
- if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
- ret = btrfs_uuid_tree_add(trans,
- root_item.received_uuid,
- BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- key.objectid);
- if (ret < 0) {
- btrfs_warn(fs_info, "uuid_tree_add failed %d",
- ret);
- break;
- }
- }
-
-skip:
- btrfs_release_path(path);
- if (trans) {
- ret = btrfs_end_transaction(trans);
- trans = NULL;
- if (ret)
- break;
- }
-
- if (key.offset < (u64)-1) {
- key.offset++;
- } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
- key.offset = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- } else if (key.objectid < (u64)-1) {
- key.offset = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.objectid++;
- } else {
- break;
- }
- cond_resched();
- }
-
-out:
- btrfs_free_path(path);
- if (trans && !IS_ERR(trans))
- btrfs_end_transaction(trans);
- if (ret)
- btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
- else if (!closing)
- set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
- up(&fs_info->uuid_tree_rescan_sem);
- return 0;
-}
-
-int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_root *uuid_root;
- struct task_struct *task;
- int ret;
-
- /*
- * 1 - root node
- * 1 - root item
- */
- trans = btrfs_start_transaction(tree_root, 2);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
- if (IS_ERR(uuid_root)) {
- ret = PTR_ERR(uuid_root);
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
- }
-
- fs_info->uuid_root = uuid_root;
-
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
-
- down(&fs_info->uuid_tree_rescan_sem);
- task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
- if (IS_ERR(task)) {
- /* fs_info->update_uuid_tree_gen remains 0 in all error case */
- btrfs_warn(fs_info, "failed to start uuid_scan task");
- up(&fs_info->uuid_tree_rescan_sem);
- return PTR_ERR(task);
- }
-
- return 0;
-}
-
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -4889,6 +4850,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff;
u64 start;
+ u64 free_diff = 0;
new_size = round_down(new_size, fs_info->sectorsize);
start = new_size;
@@ -4914,7 +4876,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
btrfs_device_set_total_bytes(device, new_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
- atomic64_sub(diff, &fs_info->free_chunk_space);
+
+ /*
+ * The new free_chunk_space is new_size - used, so we have to
+ * subtract the delta of the old free_chunk_space which included
+ * old_size - used. If used > new_size then just subtract this
+ * entire device's free space.
+ */
+ if (device->bytes_used < new_size)
+ free_diff = (old_size - device->bytes_used) -
+ (new_size - device->bytes_used);
+ else
+ free_diff = old_size - device->bytes_used;
+ atomic64_sub(free_diff, &fs_info->free_chunk_space);
}
/*
@@ -5049,9 +5023,10 @@ done:
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes += diff;
- atomic64_add(diff, &fs_info->free_chunk_space);
+ atomic64_add(free_diff, &fs_info->free_chunk_space);
+ }
mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
@@ -5422,69 +5397,140 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
}
}
+static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
+ }
+}
+
+static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ __clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT,
+ NULL, NULL);
+ }
+}
+
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ write_lock(&fs_info->mapping_tree_lock);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ /* Once for the tree reference. */
+ btrfs_free_chunk_map(map);
+}
+
+EXPORT_FOR_TESTS
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ write_lock(&fs_info->mapping_tree_lock);
+ p = &fs_info->mapping_tree.rb_root.rb_node;
+ while (*p) {
+ struct btrfs_chunk_map *entry;
+
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
+
+ if (map->start < entry->start) {
+ p = &(*p)->rb_left;
+ } else if (map->start > entry->start) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ write_unlock(&fs_info->mapping_tree_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&map->rb_node, parent, p);
+ rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
+ chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
+ chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ return 0;
+}
+
+EXPORT_FOR_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
+{
+ struct btrfs_chunk_map *map;
+
+ map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
+ if (!map)
+ return NULL;
+
+ refcount_set(&map->refs, 1);
+ RB_CLEAR_NODE(&map->rb_node);
+
+ return map;
+}
+
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *block_group;
- struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
- int i;
- int j;
- map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
if (!map)
return ERR_PTR(-ENOMEM);
+
+ map->start = start;
+ map->chunk_len = ctl->chunk_size;
+ map->stripe_size = ctl->stripe_size;
+ map->type = type;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
+ map->sub_stripes = ctl->sub_stripes;
map->num_stripes = ctl->num_stripes;
- for (i = 0; i < ctl->ndevs; ++i) {
- for (j = 0; j < ctl->dev_stripes; ++j) {
+ for (int i = 0; i < ctl->ndevs; i++) {
+ for (int j = 0; j < ctl->dev_stripes; j++) {
int s = i * ctl->dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
j * ctl->stripe_size;
}
}
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
- map->type = type;
- map->sub_stripes = ctl->sub_stripes;
trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
- em = alloc_extent_map();
- if (!em) {
- kfree(map);
- return ERR_PTR(-ENOMEM);
- }
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = start;
- em->len = ctl->chunk_size;
- em->block_start = 0;
- em->block_len = em->len;
- em->orig_block_len = ctl->stripe_size;
-
- em_tree = &info->mapping_tree;
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_chunk_map(info, map);
if (ret) {
- write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
- write_unlock(&em_tree->lock);
block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
- if (IS_ERR(block_group))
- goto error_del_extent;
+ if (IS_ERR(block_group)) {
+ btrfs_remove_chunk_map(info, map);
+ return block_group;
+ }
- for (i = 0; i < map->num_stripes; i++) {
+ for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
btrfs_device_set_bytes_used(dev,
@@ -5497,23 +5543,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
atomic64_sub(ctl->stripe_size * map->num_stripes,
&info->free_chunk_space);
- free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
return block_group;
-
-error_del_extent:
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* One for our allocation */
- free_extent_map(em);
- /* One for the tree reference */
- free_extent_map(em);
-
- return block_group;
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
@@ -5589,8 +5622,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
size_t item_size;
int i;
int ret;
@@ -5619,14 +5651,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
*/
lockdep_assert_held(&fs_info->chunk_mutex);
- em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
btrfs_abort_transaction(trans, ret);
return ret;
}
- map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
@@ -5683,7 +5714,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
out:
kfree(chunk);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5728,7 +5759,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
return 0;
}
-static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
{
const int index = btrfs_bg_flags_to_raid_index(map->type);
@@ -5737,17 +5768,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int miss_ndevs = 0;
int i;
bool ret = true;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map))
return false;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (test_bit(BTRFS_DEV_STATE_MISSING,
&map->stripes[i].dev->dev_state)) {
@@ -5768,38 +5797,57 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (miss_ndevs > btrfs_chunk_max_errors(map))
ret = false;
end:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
-void btrfs_mapping_tree_free(struct extent_map_tree *tree)
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
{
- struct extent_map *em;
+ write_lock(&fs_info->mapping_tree_lock);
+ while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
+ struct btrfs_chunk_map *map;
+ struct rb_node *node;
- while (1) {
- write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, 0, (u64)-1);
- if (em)
- remove_extent_mapping(tree, em);
- write_unlock(&tree->lock);
- if (!em)
- break;
- /* once for us */
- free_extent_map(em);
- /* once for the tree */
- free_extent_map(em);
+ node = rb_first_cached(&fs_info->mapping_tree);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ /* Once for the tree ref. */
+ btrfs_free_chunk_map(map);
+ cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
}
+ write_unlock(&fs_info->mapping_tree_lock);
+}
+
+static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+
+ /*
+ * There could be two corrupted data stripes, we need to loop retry in
+ * order to rebuild the correct data.
+ *
+ * Fail a stripe at a time on every retry except the stripe under
+ * reconstruction.
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ return map->num_stripes;
+
+ /* Non-RAID56, use their ncopies from btrfs_raid_array. */
+ return btrfs_raid_array[index].ncopies;
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
- enum btrfs_raid_types index;
- int ret = 1;
+ struct btrfs_chunk_map *map;
+ int ret;
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(map))
/*
* We could return errors for these cases, but that could get
* ugly and we'd probably do the same thing which is just not do
@@ -5808,72 +5856,53 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
*/
return 1;
- map = em->map_lookup;
- index = btrfs_bg_flags_to_raid_index(map->type);
-
- /* Non-RAID56, use their ncopies from btrfs_raid_array. */
- if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
- ret = btrfs_raid_array[index].ncopies;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
- ret = 2;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- /*
- * There could be two corrupted data stripes, we need
- * to loop retry in order to rebuild the correct data.
- *
- * Fail a stripe at a time on every retry except the
- * stripe under reconstruction.
- */
- ret = map->num_stripes;
- free_extent_map(em);
+ ret = btrfs_chunk_map_num_copies(map);
+ btrfs_free_chunk_map(map);
return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned long len = fs_info->sectorsize;
if (!btrfs_fs_incompat(fs_info, RAID56))
return len;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if (!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return len;
}
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int ret = 0;
if (!btrfs_fs_incompat(fs_info, RAID56))
return 0;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if(!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first,
+ struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
{
+ const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
int i;
int num_stripes;
int preferred_mirror;
@@ -5888,13 +5917,12 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
- switch (fs_info->fs_devices->read_policy) {
+ switch (policy) {
default:
/* Shouldn't happen, just warn and use pid instead of failing */
- btrfs_warn_rl(fs_info,
- "unknown read_policy type %u, reset to pid",
- fs_info->fs_devices->read_policy);
- fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid",
+ policy);
+ WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
fallthrough;
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
@@ -5931,6 +5959,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical,
u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -5950,6 +5979,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
bioc->fs_info = fs_info;
bioc->replace_stripe_src = -1;
bioc->full_stripe_logical = (u64)-1;
+ bioc->logical = logical;
return bioc;
}
@@ -5976,8 +6006,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
@@ -5995,11 +6024,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
int ret;
int i;
- em = btrfs_get_chunk_map(fs_info, logical, length);
- if (IS_ERR(em))
- return ERR_CAST(em);
-
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(map))
+ return ERR_CAST(map);
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -6007,8 +6034,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
goto out_free_map;
}
- offset = logical - em->start;
- length = min_t(u64, em->start + em->len - logical, length);
+ offset = logical - map->start;
+ length = min_t(u64, map->start + map->chunk_len - logical, length);
*length_ret = length;
/*
@@ -6105,10 +6132,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
}
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return stripes;
out_free_map:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
@@ -6129,20 +6156,19 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
return ret;
}
-static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_io_context *bioc,
+static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
struct btrfs_dev_replace *dev_replace,
u64 logical,
- int *num_stripes_ret, int *max_errors_ret)
+ struct btrfs_io_geometry *io_geom)
{
u64 srcdev_devid = dev_replace->srcdev->devid;
/*
* At this stage, num_stripes is still the real number of stripes,
* excluding the duplicated stripes.
*/
- int num_stripes = *num_stripes_ret;
+ int num_stripes = io_geom->num_stripes;
+ int max_errors = io_geom->max_errors;
int nr_extra_stripes = 0;
- int max_errors = *max_errors_ret;
int i;
/*
@@ -6183,7 +6209,7 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
* replace.
* If we have 2 extra stripes, only choose the one with smaller physical.
*/
- if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
+ if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
@@ -6201,22 +6227,21 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
}
}
- *num_stripes_ret = num_stripes + nr_extra_stripes;
- *max_errors_ret = max_errors + nr_extra_stripes;
+ io_geom->num_stripes = num_stripes + nr_extra_stripes;
+ io_geom->max_errors = max_errors + nr_extra_stripes;
bioc->replace_nr_stripes = nr_extra_stripes;
}
-static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
- u64 offset, u32 *stripe_nr, u64 *stripe_offset,
- u64 *full_stripe_start)
+static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
+ struct btrfs_io_geometry *io_geom)
{
/*
* Stripe_nr is the stripe where this block falls. stripe_offset is
* the offset of this block in its stripe.
*/
- *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
- *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
- ASSERT(*stripe_offset < U32_MAX);
+ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+ io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
+ ASSERT(io_geom->stripe_offset < U32_MAX);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len =
@@ -6231,18 +6256,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* to go rounddown(), not round_down(), as nr_data_stripes is
* not ensured to be power of 2.
*/
- *full_stripe_start =
- btrfs_stripe_nr_to_offset(
- rounddown(*stripe_nr, nr_data_stripes(map)));
+ io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
+ rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
- ASSERT(*full_stripe_start + full_stripe_len > offset);
- ASSERT(*full_stripe_start <= offset);
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
*/
- if (op == BTRFS_MAP_WRITE)
- return full_stripe_len - (offset - *full_stripe_start);
+ if (io_geom->op == BTRFS_MAP_WRITE)
+ return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
}
/*
@@ -6250,16 +6274,178 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* a single disk).
*/
if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
- return BTRFS_STRIPE_LEN - *stripe_offset;
+ return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
return U64_MAX;
}
-static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
- u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 *length, struct btrfs_io_stripe *dst,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
{
- dst->dev = map->stripes[stripe_index].dev;
- dst->physical = map->stripes[stripe_index].physical +
- stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ dst->dev = map->stripes[io_geom->stripe_index].dev;
+
+ if (io_geom->op == BTRFS_MAP_READ &&
+ btrfs_need_stripe_tree_update(fs_info, map->type))
+ return btrfs_get_raid_extent_offset(fs_info, logical, length,
+ map->type,
+ io_geom->stripe_index, dst);
+
+ dst->physical = map->stripes[io_geom->stripe_index].physical +
+ io_geom->stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
+ return 0;
+}
+
+static bool is_single_device_io(struct btrfs_fs_info *fs_info,
+ const struct btrfs_io_stripe *smap,
+ const struct btrfs_chunk_map *map,
+ int num_alloc_stripes,
+ enum btrfs_map_op op, int mirror_num)
+{
+ if (!smap)
+ return false;
+
+ if (num_alloc_stripes != 1)
+ return false;
+
+ if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
+ return false;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
+ return false;
+
+ return true;
+}
+
+static void map_blocks_raid0(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ if (io_geom->op == BTRFS_MAP_READ)
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
+ }
+
+ io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
+static void map_blocks_dup(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
+ }
+
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ u32 factor = map->num_stripes / map->sub_stripes;
+ int old_stripe_index;
+
+ io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
+ io_geom->stripe_nr /= factor;
+
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->sub_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index += io_geom->mirror_num - 1;
+ return;
+ }
+
+ old_stripe_index = io_geom->stripe_index;
+ io_geom->stripe_index = find_live_mirror(fs_info, map,
+ io_geom->stripe_index,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
+}
+
+static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ u64 logical, u64 *length)
+{
+ int data_stripes = nr_data_stripes(map);
+
+ /*
+ * Needs full stripe mapping.
+ *
+ * Push stripe_nr back to the start of the full stripe For those cases
+ * needing a full stripe, @stripe_nr is the full stripe number.
+ *
+ * Originally we go raid56_full_stripe_start / full_stripe_len, but
+ * that can be expensive. Here we just divide @stripe_nr with
+ * @data_stripes.
+ */
+ io_geom->stripe_nr /= data_stripes;
+
+ /* RAID[56] write or recovery. Return all stripes */
+ io_geom->num_stripes = map->num_stripes;
+ io_geom->max_errors = btrfs_chunk_max_errors(map);
+
+ /* Return the length to the full stripe end. */
+ *length = min(logical + *length,
+ io_geom->raid56_full_stripe_start + map->start +
+ btrfs_stripe_nr_to_offset(data_stripes)) -
+ logical;
+ io_geom->stripe_index = 0;
+ io_geom->stripe_offset = 0;
+}
+
+static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ int data_stripes = nr_data_stripes(map);
+
+ ASSERT(io_geom->mirror_num <= 1);
+ /* Just grab the data stripe directly. */
+ io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
+ io_geom->stripe_nr /= data_stripes;
+
+ /* We distribute the parity blocks across stripes. */
+ io_geom->stripe_index =
+ (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;
+
+ if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_single(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ io_geom->mirror_num = io_geom->stripe_index + 1;
}
/*
@@ -6296,54 +6482,42 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
* For RAID6 profile, mirror > 2 means mark another
* data/P stripe error and rebuild from the remaining
* stripes..
- *
- * @need_raid_map: (Used only for integrity checker) whether the map wants
- * a full stripe map (including all data and P/Q stripes)
- * for RAID56. Should always be 1 except integrity checker.
*/
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map)
+ struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
+ struct btrfs_io_geometry io_geom = { 0 };
u64 map_offset;
- u64 stripe_offset;
- u32 stripe_nr;
- u32 stripe_index;
- int data_stripes;
- int i;
int ret = 0;
- int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
- int num_stripes;
int num_copies;
- int max_errors = 0;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
u16 num_alloc_stripes;
- u64 raid56_full_stripe_start = (u64)-1;
u64 max_len;
ASSERT(bioc_ret);
- num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
- if (mirror_num > num_copies)
- return -EINVAL;
+ io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+ io_geom.num_stripes = 1;
+ io_geom.stripe_index = 0;
+ io_geom.op = op;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
- map = em->map_lookup;
- data_stripes = nr_data_stripes(map);
+ num_copies = btrfs_chunk_map_num_copies(map);
+ if (io_geom.mirror_num > num_copies)
+ return -EINVAL;
- map_offset = logical - em->start;
- max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
- &stripe_offset, &raid56_full_stripe_start);
- *length = min_t(u64, em->len - map_offset, max_len);
+ map_offset = logical - map->start;
+ io_geom.raid56_full_stripe_start = (u64)-1;
+ max_len = btrfs_max_io_len(map, map_offset, &io_geom);
+ *length = min_t(u64, map->chunk_len - map_offset, max_len);
if (dev_replace->replace_task != current)
down_read(&dev_replace->rwsem);
@@ -6356,110 +6530,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
up_read(&dev_replace->rwsem);
- num_stripes = 1;
- stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- if (op == BTRFS_MAP_READ)
- mirror_num = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- stripe_index = find_live_mirror(fs_info, map, 0,
- dev_replace_is_ongoing);
- mirror_num = stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- mirror_num = 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u32 factor = map->num_stripes / map->sub_stripes;
-
- stripe_index = (stripe_nr % factor) * map->sub_stripes;
- stripe_nr /= factor;
-
- if (op != BTRFS_MAP_READ)
- num_stripes = map->sub_stripes;
- else if (mirror_num)
- stripe_index += mirror_num - 1;
- else {
- int old_stripe_index = stripe_index;
- stripe_index = find_live_mirror(fs_info, map,
- stripe_index,
- dev_replace_is_ongoing);
- mirror_num = stripe_index - old_stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
- /*
- * Push stripe_nr back to the start of the full stripe
- * For those cases needing a full stripe, @stripe_nr
- * is the full stripe number.
- *
- * Originally we go raid56_full_stripe_start / full_stripe_len,
- * but that can be expensive. Here we just divide
- * @stripe_nr with @data_stripes.
- */
- stripe_nr /= data_stripes;
-
- /* RAID[56] write or recovery. Return all stripes */
- num_stripes = map->num_stripes;
- max_errors = btrfs_chunk_max_errors(map);
-
- /* Return the length to the full stripe end */
- *length = min(logical + *length,
- raid56_full_stripe_start + em->start +
- btrfs_stripe_nr_to_offset(data_stripes)) -
- logical;
- stripe_index = 0;
- stripe_offset = 0;
- } else {
- /*
- * Mirror #0 or #1 means the original data block.
- * Mirror #2 is RAID5 parity block.
- * Mirror #3 is RAID6 Q block.
- */
- stripe_index = stripe_nr % data_stripes;
- stripe_nr /= data_stripes;
- if (mirror_num > 1)
- stripe_index = data_stripes + mirror_num - 2;
-
- /* We distribute the parity blocks across stripes */
- stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (op == BTRFS_MAP_READ && mirror_num <= 1)
- mirror_num = 1;
- }
- } else {
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case BTRFS_BLOCK_GROUP_RAID0:
+ map_blocks_raid0(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ map_blocks_dup(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
+ map_blocks_raid56_write(map, &io_geom, logical, length);
+ else
+ map_blocks_raid56_read(map, &io_geom);
+ break;
+ default:
/*
* After this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- mirror_num = stripe_index + 1;
+ map_blocks_single(map, &io_geom);
+ break;
}
- if (stripe_index >= map->num_stripes) {
+ if (io_geom.stripe_index >= map->num_stripes) {
btrfs_crit(fs_info,
"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
- stripe_index, map->num_stripes);
+ io_geom.stripe_index, map->num_stripes);
ret = -EINVAL;
goto out;
}
- num_alloc_stripes = num_stripes;
+ num_alloc_stripes = io_geom.num_stripes;
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ)
/*
@@ -6476,17 +6586,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* physical block information on the stack instead of allocating an
* I/O context structure.
*/
- if (smap && num_alloc_stripes == 1 &&
- !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
+ io_geom.mirror_num)) {
+ ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
if (mirror_num_ret)
- *mirror_num_ret = mirror_num;
+ *mirror_num_ret = io_geom.mirror_num;
*bioc_ret = NULL;
- ret = 0;
goto out;
}
- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
+ bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
if (!bioc) {
ret = -ENOMEM;
goto out;
@@ -6500,8 +6609,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*
* It's still mostly the same as other profiles, just with extra rotation.
*/
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
- (op != BTRFS_MAP_READ || mirror_num > 1)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+ (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
* before us, which is also the rotation value (needs to modulo
@@ -6510,37 +6619,52 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* In this case, we just add @stripe_nr with @i, then do the
* modulo, to reduce one modulo call.
*/
- bioc->full_stripe_logical = em->start +
- btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
- for (i = 0; i < num_stripes; i++)
- set_io_stripe(&bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
+ bioc->full_stripe_logical = map->start +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
+ nr_data_stripes(map));
+ for (int i = 0; i < io_geom.num_stripes; i++) {
+ struct btrfs_io_stripe *dst = &bioc->stripes[i];
+ u32 stripe_index;
+
+ stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical =
+ map->stripes[stripe_index].physical +
+ io_geom.stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
+ }
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
- for (i = 0; i < num_stripes; i++) {
- set_io_stripe(&bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
- stripe_index++;
+ for (int i = 0; i < io_geom.num_stripes; i++) {
+ ret = set_io_stripe(fs_info, logical, length,
+ &bioc->stripes[i], map, &io_geom);
+ if (ret < 0)
+ break;
+ io_geom.stripe_index++;
}
}
+ if (ret) {
+ *bioc_ret = NULL;
+ btrfs_put_bioc(bioc);
+ goto out;
+ }
+
if (op != BTRFS_MAP_READ)
- max_errors = btrfs_chunk_max_errors(map);
+ io_geom.max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ) {
- handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
- &num_stripes, &max_errors);
+ handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
}
*bioc_ret = bioc;
- bioc->num_stripes = num_stripes;
- bioc->max_errors = max_errors;
- bioc->mirror_num = mirror_num;
+ bioc->num_stripes = io_geom.num_stripes;
+ bioc->max_errors = io_geom.max_errors;
+ bioc->mirror_num = io_geom.mirror_num;
out:
if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
@@ -6548,7 +6672,7 @@ out:
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -6720,12 +6844,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-u64 btrfs_calc_stripe_length(const struct extent_map *em)
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
- const struct map_lookup *map = em->map_lookup;
const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(em->len, data_stripes);
+ return div_u64(map->chunk_len, data_stripes);
}
#if BITS_PER_LONG == 32
@@ -6794,9 +6917,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
u64 logical;
u64 length;
u64 devid;
@@ -6830,35 +6951,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
return ret;
}
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, logical, 1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
- if (em && em->start <= logical && em->start + em->len > logical) {
- free_extent_map(em);
+ if (map && map->start <= logical && map->start + map->chunk_len > logical) {
+ btrfs_free_chunk_map(map);
return 0;
- } else if (em) {
- free_extent_map(em);
+ } else if (map) {
+ btrfs_free_chunk_map(map);
}
- em = alloc_extent_map();
- if (!em)
- return -ENOMEM;
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- free_extent_map(em);
+ map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
+ if (!map)
return -ENOMEM;
- }
-
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = logical;
- em->len = length;
- em->orig_start = 0;
- em->block_start = 0;
- em->block_len = em->len;
+ map->start = logical;
+ map->chunk_len = length;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
@@ -6873,7 +6981,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
*/
map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- em->orig_block_len = btrfs_calc_stripe_length(em);
+ map->stripe_size = btrfs_calc_stripe_length(map);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -6889,7 +6997,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
ret = PTR_ERR(map->stripes[i].dev);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
}
@@ -6898,15 +7006,13 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
&(map->stripes[i].dev->dev_state));
}
- write_lock(&map_tree->lock);
- ret = add_extent_mapping(map_tree, em, 0);
- write_unlock(&map_tree->lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
- em->start, em->len, ret);
+ map->start, map->chunk_len, ret);
+ btrfs_free_chunk_map(map);
}
- free_extent_map(em);
return ret;
}
@@ -6954,7 +7060,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
- fs_devices = alloc_fs_devices(fsid, NULL);
+ fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -7216,26 +7322,21 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- u64 next_start = 0;
+ struct btrfs_chunk_map *map;
+ u64 next_start;
bool ret = true;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, 0, (u64)-1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
/* No chunk at all? Return false anyway */
- if (!em) {
+ if (!map) {
ret = false;
goto out;
}
- while (em) {
- struct map_lookup *map;
+ while (map) {
int missing = 0;
int max_tolerated;
int i;
- map = em->map_lookup;
max_tolerated =
btrfs_get_num_tolerated_disk_barrier_failures(
map->type);
@@ -7253,18 +7354,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (!failing_dev)
btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
- em->start, missing, max_tolerated);
- free_extent_map(em);
+ map->start, missing, max_tolerated);
+ btrfs_free_chunk_map(map);
ret = false;
goto out;
}
- next_start = extent_map_end(em);
- free_extent_map(em);
+ next_start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, next_start,
- (u64)(-1) - next_start);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
}
out:
return ret;
@@ -7757,20 +7855,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 physical_offset, u64 physical_len)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
int i;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ if (!map) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7778,12 +7871,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
- map = em->map_lookup;
- stripe_len = btrfs_calc_stripe_length(em);
+ stripe_len = btrfs_calc_stripe_length(map);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
- physical_offset, devid, em->start, physical_len,
+ physical_offset, devid, map->start, physical_len,
stripe_len);
ret = -EUCLEAN;
goto out;
@@ -7806,7 +7898,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
"too many dev extents for chunk %llu found",
- em->start);
+ map->start);
ret = -EUCLEAN;
goto out;
}
@@ -7852,32 +7944,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
struct rb_node *node;
int ret = 0;
- read_lock(&em_tree->lock);
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- em = rb_entry(node, struct extent_map, rb_node);
- if (em->map_lookup->num_stripes !=
- em->map_lookup->verified_stripes) {
+ read_lock(&fs_info->mapping_tree_lock);
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ if (map->num_stripes != map->verified_stripes) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
- em->start, em->map_lookup->verified_stripes,
- em->map_lookup->num_stripes);
+ map->start, map->verified_stripes, map->num_stripes);
ret = -EUCLEAN;
goto out;
}
}
out:
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
@@ -8129,7 +8219,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
ASSERT(mirror_num > 0);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
- &bioc, smap, &mirror_ret, true);
+ &bioc, smap, &mirror_ret);
if (ret < 0)
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5203095318b0..4481575dd70f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -6,13 +6,28 @@
#ifndef BTRFS_VOLUMES_H
#define BTRFS_VOLUMES_H
+#include <linux/blk_types.h>
+#include <linux/sizes.h>
+#include <linux/atomic.h>
#include <linux/sort.h>
-#include <linux/btrfs.h>
-#include "async-thread.h"
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/log2.h>
+#include <linux/kobject.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs.h>
#include "messages.h"
-#include "tree-checker.h"
#include "rcu-string.h"
+struct block_device;
+struct bdev_handle;
+struct btrfs_fs_info;
+struct btrfs_block_group;
+struct btrfs_trans_handle;
+struct btrfs_zoned_device_info;
+
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
/*
@@ -83,7 +98,10 @@ enum btrfs_raid_types {
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
#define BTRFS_DEV_STATE_NO_READA (5)
-struct btrfs_zoned_device_info;
+/* Special value encoding failure to write primary super block. */
+#define BTRFS_SUPER_PRIMARY_WRITE_ERROR (INT_MAX / 2)
+
+struct btrfs_fs_devices;
struct btrfs_device {
struct list_head dev_list; /* device_list_mutex */
@@ -96,13 +114,11 @@ struct btrfs_device {
u64 generation;
+ struct file *bdev_file;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
- /* block device holder for blkdev_get/put */
- void *holder;
-
/*
* Device's major-minor number. Must be set even if the device is not
* opened (bdev == NULL), unless the device is missing.
@@ -135,6 +151,12 @@ struct btrfs_device {
/* type and info about this device */
u64 type;
+ /*
+ * Counter of super block write errors, values larger than
+ * BTRFS_SUPER_PRIMARY_WRITE_ERROR encode primary super block write failure.
+ */
+ atomic_t sb_write_errors;
+
/* minimal io size for this device */
u32 sector_size;
@@ -284,6 +306,25 @@ enum btrfs_read_policy {
BTRFS_NR_READ_POLICY,
};
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Checksum mode - offload it to workqueues or do it synchronously in
+ * btrfs_submit_chunk().
+ */
+enum btrfs_offload_csum_mode {
+ /*
+ * Choose offloading checksum or do it synchronously automatically.
+ * Do it synchronously if the checksum is fast, or offload to workqueues
+ * otherwise.
+ */
+ BTRFS_OFFLOAD_CSUM_AUTO,
+ /* Always offload checksum to workqueues. */
+ BTRFS_OFFLOAD_CSUM_FORCE_ON,
+ /* Never offload checksum to workqueues. */
+ BTRFS_OFFLOAD_CSUM_FORCE_OFF,
+};
+#endif
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
@@ -296,6 +337,19 @@ struct btrfs_fs_devices {
* - Following shall be true at all times:
* - metadata_uuid == btrfs_header::fsid
* - metadata_uuid == btrfs_dev_item::fsid
+ *
+ * - Relations between fsid and metadata_uuid in sb and fs_devices:
+ * - Normal:
+ * fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid
+ * sb->metadata_uuid == 0
+ *
+ * - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set:
+ * fs_devices->fsid == sb->fsid
+ * fs_devices->metadata_uuid == sb->metadata_uuid
+ *
+ * - When in-memory fs_devices->temp_fsid is true
+ * fs_devices->fsid = random
+ * fs_devices->metadata_uuid == sb->fsid
*/
u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -359,9 +413,10 @@ struct btrfs_fs_devices {
bool rotating;
/* Devices support TRIM/discard commands. */
bool discardable;
- bool fsid_change;
/* The filesystem is a seed filesystem. */
bool seeding;
+ /* The mount needs to use a randomly generated fsid. */
+ bool temp_fsid;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -374,6 +429,11 @@ struct btrfs_fs_devices {
/* Policy used to read the mirrored stripes. */
enum btrfs_read_policy read_policy;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /* Checksum mode - offload it or do it synchronously. */
+ enum btrfs_offload_csum_mode offload_csum_mode;
+#endif
};
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
@@ -387,12 +447,12 @@ struct btrfs_fs_devices {
struct btrfs_io_stripe {
struct btrfs_device *dev;
- union {
- /* Block mapping */
- u64 physical;
- /* For the endio handler */
- struct btrfs_io_context *bioc;
- };
+ /* Block mapping. */
+ u64 physical;
+ u64 length;
+ bool rst_search_commit_root;
+ /* For the endio handler. */
+ struct btrfs_io_context *bioc;
};
struct btrfs_discard_stripe {
@@ -420,11 +480,17 @@ struct btrfs_discard_stripe {
struct btrfs_io_context {
refcount_t refs;
struct btrfs_fs_info *fs_info;
- u64 map_type; /* get from map_lookup->type */
+ /* Taken from struct btrfs_chunk_map::type. */
+ u64 map_type;
struct bio *orig_bio;
atomic_t error;
u16 max_errors;
+ u64 logical;
+ u64 size;
+ /* Raid stripe tree ordered entry. */
+ struct list_head rst_ordered_entry;
+
/*
* The total number of stripes, including the extra duplicated
* stripe for replace.
@@ -518,21 +584,33 @@ struct btrfs_raid_attr {
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-struct map_lookup {
+struct btrfs_chunk_map {
+ struct rb_node rb_node;
+ /* For mount time dev extent verification. */
+ int verified_stripes;
+ refcount_t refs;
+ u64 start;
+ u64 chunk_len;
+ u64 stripe_size;
u64 type;
int io_align;
int io_width;
int num_stripes;
int sub_stripes;
- int verified_stripes; /* For mount time dev extent verification */
struct btrfs_io_stripe stripes[];
};
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_io_stripe) * (n)))
+#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \
+ (sizeof(struct btrfs_io_stripe) * (n)))
+
+static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
+{
+ if (map && refcount_dec_and_test(&map->refs)) {
+ ASSERT(RB_EMPTY_NODE(&map->rb_node));
+ kfree(map);
+ }
+}
-struct btrfs_balance_args;
-struct btrfs_balance_progress;
struct btrfs_balance_control {
struct btrfs_balance_args data;
struct btrfs_balance_args meta;
@@ -587,7 +665,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
}
/*
- * Do the type safe converstion from stripe_nr to offset inside the chunk.
+ * Do the type safe conversion from stripe_nr to offset inside the chunk.
*
* @stripe_nr is u32, with left shift it can overflow u32 for chunks larger
* than 4G. This does the proper type cast to avoid overflow.
@@ -602,8 +680,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map);
+ struct btrfs_io_stripe *smap, int *mirror_num_ret);
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
struct btrfs_io_stripe *smap, u64 logical,
u32 length, int mirror_num);
@@ -614,10 +691,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
-void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags);
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+ bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -635,7 +713,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, void **holder);
+ struct file **bdev_file);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
@@ -653,8 +731,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
-int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
-int btrfs_uuid_scan_kthread(void *data);
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
@@ -669,13 +745,24 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
-u64 btrfs_calc_stripe_length(const struct extent_map *em);
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp);
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+#endif
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
@@ -742,9 +829,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
-void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device);
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
int btrfs_bg_type_to_factor(u64 flags);
@@ -753,6 +838,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb);
+const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b906f809650e..ce464cd8e0ac 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -24,7 +24,7 @@
#include "accessors.h"
#include "dir-item.h"
-int btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(const struct inode *inode, const char *name,
void *buffer, size_t size)
{
struct btrfs_dir_item *di;
@@ -120,7 +120,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
* locks the inode's i_mutex before calling setxattr or removexattr.
*/
if (flags & XATTR_REPLACE) {
- ASSERT(inode_is_locked(inode));
+ btrfs_assert_inode_locked(BTRFS_I(inode));
di = btrfs_lookup_xattr(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), name, name_len, 0);
if (!di)
@@ -265,7 +265,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
+static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name, void *buffer,
+ size_t size)
+{
+ int ret;
+ bool is_cap = false;
+
+ name = xattr_full_name(handler, name);
+
+ /*
+ * security.capability doesn't cache the results, so calls into us
+ * constantly to see if there's a capability xattr. Cache the result
+ * here in order to avoid wasting time doing lookups for xattrs we know
+ * don't exist.
+ */
+ if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+ is_cap = true;
+ if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags))
+ return -ENODATA;
+ }
+
+ ret = btrfs_getxattr(inode, name, buffer, size);
+ if (ret == -ENODATA && is_cap)
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+ return ret;
+}
+
+static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name,
+ const void *buffer,
+ size_t size, int flags)
+{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
+ name = xattr_full_name(handler, name);
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ return btrfs_setxattr_trans(inode, name, buffer, size, flags);
+}
+
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
@@ -404,11 +451,11 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_set_prop(trans, inode, name, value, size, flags);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), name, value, size, flags);
if (!ret) {
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
}
@@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
static const struct xattr_handler btrfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .get = btrfs_xattr_handler_get,
- .set = btrfs_xattr_handler_set,
+ .get = btrfs_xattr_handler_get_security,
+ .set = btrfs_xattr_handler_set_security,
};
static const struct xattr_handler btrfs_trusted_xattr_handler = {
@@ -442,7 +489,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = {
.set = btrfs_xattr_handler_set_prop,
};
-const struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler * const btrfs_xattr_handlers[] = {
&btrfs_security_xattr_handler,
&btrfs_trusted_xattr_handler,
&btrfs_user_xattr_handler,
@@ -457,7 +504,7 @@ static int btrfs_initxattrs(struct inode *inode,
const struct xattr *xattr;
unsigned int nofs_flag;
char *name;
- int err = 0;
+ int ret = 0;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
@@ -468,19 +515,23 @@ static int btrfs_initxattrs(struct inode *inode,
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
- err = -ENOMEM;
+ ret = -ENOMEM;
break;
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
- err = btrfs_setxattr(trans, inode, name, xattr->value,
+
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ ret = btrfs_setxattr(trans, inode, name, xattr->value,
xattr->value_len, 0);
kfree(name);
- if (err < 0)
+ if (ret < 0)
break;
}
memalloc_nofs_restore(nofs_flag);
- return err;
+ return ret;
}
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 1cd3fc0a8f17..8dc4cf49f6f0 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -6,11 +6,15 @@
#ifndef BTRFS_XATTR_H
#define BTRFS_XATTR_H
-#include <linux/xattr.h>
+struct dentry;
+struct inode;
+struct qstr;
+struct xattr_handler;
+struct btrfs_trans_handle;
-extern const struct xattr_handler *btrfs_xattr_handlers[];
+extern const struct xattr_handler * const btrfs_xattr_handlers[];
-int btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(const struct inode *inode, const char *name,
void *buffer, size_t size);
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const void *value, size_t size, int flags);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 9f60d0bbd530..03958d1a53b0 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -18,7 +18,10 @@
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/refcount.h>
+#include "btrfs_inode.h"
#include "compression.h"
+#include "fs.h"
+#include "subpage.h"
/* workspace buffer size for s390 zlib hardware support */
#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE)
@@ -91,29 +94,35 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
char *data_in = NULL;
- char *cpage_out;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
+ char *cfolio_out;
+ int nr_folios = 0;
+ struct folio *in_folio = NULL;
+ struct folio *out_folio = NULL;
unsigned long bytes_left;
- unsigned int in_buf_pages;
+ unsigned int in_buf_folios;
unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+ unsigned long nr_dest_folios = *out_folios;
+ const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const u64 orig_end = start + len;
- *out_pages = 0;
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
- if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) {
- pr_warn("BTRFS: deflateInit failed\n");
+ ret = zlib_deflateInit(&workspace->strm, workspace->level);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zlib compression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
ret = -EIO;
goto out;
}
@@ -121,18 +130,18 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[0] = out_page;
- nr_pages = 1;
+ cfolio_out = folio_address(out_folio);
+ folios[0] = out_folio;
+ nr_folios = 1;
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = 0;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
workspace->strm.avail_out = PAGE_SIZE;
while (workspace->strm.total_in < len) {
@@ -142,43 +151,63 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*/
if (workspace->strm.avail_in == 0) {
bytes_left = len - workspace->strm.total_in;
- in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
- workspace->buf_size / PAGE_SIZE);
- if (in_buf_pages > 1) {
+ in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
+ workspace->buf_size / PAGE_SIZE);
+ if (in_buf_folios > 1) {
int i;
- for (i = 0; i < in_buf_pages; i++) {
+ /* S390 hardware acceleration path, not subpage. */
+ ASSERT(!btrfs_is_subpage(
+ inode_to_fs_info(mapping->host),
+ mapping));
+ for (i = 0; i < in_buf_folios; i++) {
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
+ data_in = NULL;
}
- in_page = find_get_page(mapping,
- start >> PAGE_SHIFT);
- data_in = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping,
+ start, &in_folio);
+ if (ret < 0)
+ goto out;
+ data_in = kmap_local_folio(in_folio, 0);
copy_page(workspace->buf + i * PAGE_SIZE,
data_in);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
+ workspace->strm.avail_in = min(bytes_left,
+ in_buf_folios << PAGE_SHIFT);
} else {
+ unsigned int pg_off;
+ unsigned int cur_len;
+
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
+ data_in = NULL;
}
- in_page = find_get_page(mapping,
- start >> PAGE_SHIFT);
- data_in = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping,
+ start, &in_folio);
+ if (ret < 0)
+ goto out;
+ pg_off = offset_in_page(start);
+ cur_len = btrfs_calc_input_length(orig_end, start);
+ data_in = kmap_local_folio(in_folio, pg_off);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = cur_len;
}
- workspace->strm.avail_in = min(bytes_left,
- (unsigned long) workspace->buf_size);
}
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
- if (ret != Z_OK) {
- pr_debug("BTRFS: deflate in loop returned %d\n",
- ret);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_warn(inode->root->fs_info,
+ "zlib compression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
@@ -196,20 +225,20 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[nr_pages] = out_page;
- nr_pages++;
+ cfolio_out = folio_address(out_folio);
+ folios[nr_folios] = out_folio;
+ nr_folios++;
workspace->strm.avail_out = PAGE_SIZE;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
}
/* we're all done */
if (workspace->strm.total_in >= len)
@@ -231,21 +260,21 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -EIO;
goto out;
} else if (workspace->strm.avail_out == 0) {
- /* get another page for the stream end */
- if (nr_pages == nr_dest_pages) {
+ /* Get another folio for the stream end. */
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[nr_pages] = out_page;
- nr_pages++;
+ cfolio_out = folio_address(out_folio);
+ folios[nr_folios] = out_folio;
+ nr_folios++;
workspace->strm.avail_out = PAGE_SIZE;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
}
}
zlib_deflateEnd(&workspace->strm);
@@ -259,10 +288,10 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = workspace->strm.total_out;
*total_in = workspace->strm.total_in;
out:
- *out_pages = nr_pages;
+ *out_folios = nr_folios;
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
}
return ret;
@@ -275,13 +304,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
int wbits = MAX_WBITS;
char *data_in;
size_t total_out = 0;
- unsigned long page_in_index = 0;
+ unsigned long folio_in_index = 0;
size_t srclen = cb->compressed_len;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
- struct page **pages_in = cb->compressed_pages;
+ struct folio **folios_in = cb->compressed_folios;
- data_in = kmap_local_page(pages_in[page_in_index]);
+ data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -301,9 +330,14 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
- pr_warn("BTRFS: inflateInit failed\n");
+ ret = zlib_inflateInit2(&workspace->strm, wbits);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
kunmap_local(data_in);
+ btrfs_err(inode->root->fs_info,
+ "zlib decompression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -331,21 +365,26 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
kunmap_local(data_in);
- page_in_index++;
- if (page_in_index >= total_pages_in) {
+ folio_in_index++;
+ if (folio_in_index >= total_folios_in) {
data_in = NULL;
break;
}
- data_in = kmap_local_page(pages_in[page_in_index]);
+ data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
}
}
- if (ret != Z_STREAM_END)
+ if (unlikely(ret != Z_STREAM_END)) {
+ btrfs_err(cb->bbio.inode->root->fs_info,
+ "zlib decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(cb->bbio.inode->root),
+ btrfs_ino(cb->bbio.inode), cb->start);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
done:
zlib_inflateEnd(&workspace->strm);
if (data_in)
@@ -354,7 +393,7 @@ done:
}
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -380,8 +419,14 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
- pr_warn("BTRFS: inflateInit failed\n");
+ ret = zlib_inflateInit2(&workspace->strm, wbits);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
+
+ btrfs_err(inode->root->fs_info,
+ "zlib decompression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ folio_pos(dest_folio));
return -EIO;
}
@@ -394,12 +439,16 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
if (ret != Z_STREAM_END)
goto out;
- memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
+ memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy);
out:
if (unlikely(to_copy != destlen)) {
- pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n",
- to_copy, destlen);
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
+
+ btrfs_err(inode->root->fs_info,
+"zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ folio_pos(dest_folio), to_copy, destlen);
ret = -EIO;
} else {
ret = 0;
@@ -408,7 +457,7 @@ out:
zlib_inflateEnd(&workspace->strm);
if (unlikely(to_copy < destlen))
- memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
+ folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index c4463c3f2068..53d8c49ec058 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -12,10 +12,8 @@
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
-#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"
-#include "super.h"
#include "fs.h"
#include "accessors.h"
#include "bio.h"
@@ -89,9 +87,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
- int i;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
full[i] = sb_zone_is_full(&zones[i]);
@@ -120,12 +117,11 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
return -ENOENT;
} else if (full[0] && full[1]) {
/* Compare two super blocks */
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
struct page *page[BTRFS_NR_SB_LOG_ZONES];
struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
- int i;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
BTRFS_SUPER_INFO_SIZE;
@@ -146,7 +142,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
else
sector = zones[0].start;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
btrfs_release_disk_super(super[i]);
} else if (!full[0] && (empty[1] || full[1])) {
sector = zones[0].wp;
@@ -291,7 +287,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
/* The emulated zone size is determined from the size of device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -308,28 +304,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
+ return ret;
/* No dev extents at all? Not good */
- if (ret > 0) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (ret > 0)
+ return -EUCLEAN;
}
leaf = path->nodes[0];
dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
- ret = 0;
-
-out:
- btrfs_free_path(path);
-
- return ret;
+ return 0;
}
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@@ -578,26 +567,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
kvfree(zones);
- switch (bdev_zoned_model(bdev)) {
- case BLK_ZONED_HM:
+ if (bdev_is_zoned(bdev)) {
model = "host-managed zoned";
emulated = "";
- break;
- case BLK_ZONED_HA:
- model = "host-aware zoned";
- emulated = "";
- break;
- case BLK_ZONED_NONE:
+ } else {
model = "regular";
emulated = "emulated ";
- break;
- default:
- /* Just in case */
- btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
- bdev_zoned_model(bdev),
- rcu_str_deref(device->name));
- ret = -EOPNOTSUPP;
- goto out_free_zone_info;
}
btrfs_info_in_rcu(fs_info,
@@ -609,9 +584,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
out:
kvfree(zones);
-out_free_zone_info:
btrfs_destroy_dev_zone_info(device);
-
return ret;
}
@@ -670,8 +643,7 @@ out:
return NULL;
}
-int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone)
+static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
{
unsigned int nr_zones = 1;
int ret;
@@ -688,8 +660,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
- if (device->bdev &&
- bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ if (device->bdev && bdev_is_zoned(device->bdev)) {
btrfs_err(fs_info,
"zoned: mode not enabled but zoned device found: %pg",
device->bdev);
@@ -781,7 +752,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* Check mount options here, because we might change fs_info->zoned
* from fs_info->zone_size.
*/
- ret = btrfs_check_mountopts_zoned(fs_info);
+ ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
if (ret)
return ret;
@@ -789,7 +760,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt)
{
if (!btrfs_is_zoned(info))
return 0;
@@ -798,18 +770,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
* Space cache writing is not COWed. Disable that to avoid write errors
* in sequential zones.
*/
- if (btrfs_test_opt(info, SPACE_CACHE)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
btrfs_err(info, "zoned: space cache v1 is not supported");
return -EINVAL;
}
- if (btrfs_test_opt(info, NODATACOW)) {
+ if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
btrfs_err(info, "zoned: NODATACOW not supported");
return -EINVAL;
}
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "zoned: async discard ignored and disabled for zoned mode");
+ if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
+ btrfs_info(info,
+ "zoned: async discard ignored and disabled for zoned mode");
+ btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
+ }
return 0;
}
@@ -838,11 +813,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ unsigned int nofs_flags;
+
ASSERT(sb_zone_is_full(reset));
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- reset->start, reset->len,
- GFP_NOFS);
+ reset->start, reset->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -988,11 +966,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
* explicit ZONE_FINISH is not necessary.
*/
if (zone->wp != zone->start + zone->capacity) {
+ unsigned int nofs_flags;
int ret;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_NOFS);
+ zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
@@ -1010,11 +991,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
+ unsigned int nofs_flags;
sector_t zone_sectors;
sector_t nr_sectors;
u8 zone_sectors_shift;
u32 sb_zone;
u32 nr_zones;
+ int ret;
zone_sectors = bdev_zone_sectors(bdev);
zone_sectors_shift = ilog2(zone_sectors);
@@ -1025,9 +1008,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
/*
@@ -1138,12 +1124,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
+ unsigned int nofs_flags;
int ret;
*bytes = 0;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
- physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_NOFS);
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -1216,7 +1204,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
@@ -1251,7 +1239,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
if (!ret)
ret = -EUCLEAN;
if (ret < 0)
- goto out;
+ return ret;
ret = btrfs_previous_extent_item(root, path, cache->start);
if (ret) {
@@ -1259,7 +1247,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
ret = 0;
*offset_ret = 0;
}
- goto out;
+ return ret;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
@@ -1271,15 +1259,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
if (!(found_key.objectid >= cache->start &&
found_key.objectid + length <= cache->start + cache->length)) {
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
*offset_ret = found_key.objectid + length - cache->start;
- ret = 0;
-
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
struct zone_info {
@@ -1290,7 +1273,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct map_lookup *map)
+ struct btrfs_chunk_map *map)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device;
@@ -1400,16 +1383,19 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
- if (map->type & BTRFS_BLOCK_GROUP_DATA) {
- btrfs_err(bg->fs_info,
- "zoned: profile DUP not yet supported on data bg");
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
return -EINVAL;
}
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
+
if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
@@ -1436,16 +1422,128 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
}
bg->alloc_offset = zone_info[0].alloc_offset;
- bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
+ return 0;
+}
+
+static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
+ struct btrfs_chunk_map *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ int i;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ /* In case a device is missing we have a cap of 0, so don't use it. */
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in %s profile",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EIO;
+ }
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg)) {
+ return -EIO;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ }
+
+ if (zone_info[0].alloc_offset != WP_MISSING_DEV)
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ else
+ bg->alloc_offset = zone_info[i - 1].alloc_offset;
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
+ struct btrfs_chunk_map *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
+ struct btrfs_chunk_map *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ if ((i % map->sub_stripes) == 0) {
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+ }
+
return 0;
}
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 logical = cache->start;
u64 length = cache->length;
struct zone_info *zone_info = NULL;
@@ -1454,6 +1552,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
+ u64 profile;
if (!btrfs_is_zoned(fs_info))
return 0;
@@ -1466,21 +1565,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return -EIO;
}
- /* Get the chunk mapping */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
-
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, logical, length);
+ if (!map)
return -EINVAL;
- map = em->map_lookup;
-
- cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
- if (!cache->physical_map) {
- ret = -ENOMEM;
- goto out;
- }
+ cache->physical_map = map;
zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
if (!zone_info) {
@@ -1524,7 +1613,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
}
- switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ switch (profile) {
case 0: /* single */
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
@@ -1532,11 +1622,18 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
break;
case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID0:
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID10:
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
- /* non-single profiles are not supported yet */
default:
btrfs_err(fs_info, "zoned: profile %s not yet supported",
btrfs_bg_type_to_raid_name(map->type));
@@ -1544,12 +1641,30 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
+ profile != BTRFS_BLOCK_GROUP_RAID10) {
+ /*
+ * Detected broken write pointer. Make this block group
+ * unallocatable by setting the allocation pointer at the end of
+ * allocatable region. Relocating this block group will fix the
+ * mismatch.
+ *
+ * Currently, we cannot handle RAID0 or RAID10 case like this
+ * because we don't have a proper zone_capacity value. But,
+ * reading from this block group won't work anyway by a missing
+ * stripe.
+ */
+ cache->alloc_offset = cache->zone_capacity;
+ }
+
out:
- if (cache->alloc_offset > fs_info->zone_size) {
- btrfs_err(fs_info,
- "zoned: invalid write pointer %llu in block group %llu",
- cache->alloc_offset, cache->start);
- ret = -EIO;
+ /* Reject non SINGLE data profiles without RST */
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
+ (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
}
if (cache->alloc_offset > cache->zone_capacity) {
@@ -1578,12 +1693,11 @@ out:
spin_unlock(&fs_info->zone_active_bgs_lock);
}
} else {
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
cache->physical_map = NULL;
}
bitmap_free(active);
kfree(zone_info);
- free_extent_map(em);
return ret;
}
@@ -1606,22 +1720,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
cache->zone_unusable = unusable;
}
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb)
-{
- if (!btrfs_is_zoned(eb->fs_info) ||
- btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
- return;
-
- ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-
- memzero_extent_buffer(eb, 0, eb->len);
- set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
- set_extent_buffer_dirty(eb);
- set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
-}
-
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
@@ -1633,7 +1731,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
if (!btrfs_is_zoned(fs_info))
return false;
- if (!inode || !is_data_inode(&inode->vfs_inode))
+ if (!inode || !is_data_inode(inode))
return false;
if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
@@ -1675,7 +1773,7 @@ void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
u64 logical)
{
- struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
+ struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
struct extent_map *em;
ordered->disk_bytenr = logical;
@@ -1683,7 +1781,9 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
write_lock(&em_tree->lock);
em = search_extent_mapping(em_tree, ordered->file_offset,
ordered->num_bytes);
- em->block_start = logical;
+ /* The em should be a new COW extent, thus it should not have an offset. */
+ ASSERT(em->offset == 0);
+ em->disk_bytenr = logical;
free_extent_map(em);
write_unlock(&em_tree->lock);
}
@@ -1694,7 +1794,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
struct btrfs_ordered_extent *new;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
+ split_extent_map(ordered->inode, ordered->file_offset,
ordered->num_bytes, len, logical))
return false;
@@ -1708,7 +1808,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sum;
u64 logical, len;
@@ -1752,7 +1852,7 @@ out:
* here so that we don't attempt to log the csums later.
*/
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
+ test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) {
while ((sum = list_first_entry_or_null(&ordered->list,
typeof(*sum), list))) {
list_del(&sum->list);
@@ -1897,7 +1997,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
int i, ret;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bioc, NULL, NULL, 1);
+ &mapped_length, &bioc, NULL, NULL);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
ret = -EIO;
goto out_put_bioc;
@@ -1973,7 +2073,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *device;
u64 physical;
const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
@@ -2006,6 +2106,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
physical = map->stripes[i].physical;
zinfo = device->zone_info;
+ if (!device->bdev)
+ continue;
+
if (zinfo->max_active_zones == 0)
continue;
@@ -2081,9 +2184,10 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
const bool is_metadata = (block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int ret = 0;
int i;
@@ -2121,8 +2225,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
/* Ensure all writes in this block group finish */
btrfs_wait_block_group_reservations(block_group);
/* No need to wait for NOCOW writers. Zoned mode does not allow that */
- btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
- block_group->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
/* Wait for extent buffers to be written. */
if (is_metadata)
wait_eb_writebacks(block_group);
@@ -2159,27 +2262,36 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
+ down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int nofs_flags;
+
+ if (!device->bdev)
+ continue;
if (zinfo->max_active_zones == 0)
continue;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
- if (ret)
+ if (ret) {
+ up_read(&dev_replace->rwsem);
return ret;
+ }
if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
zinfo->reserved_active_zones++;
btrfs_dev_clear_active_zone(device, physical);
}
+ up_read(&dev_replace->rwsem);
if (!fully_written)
btrfs_dec_block_group_ro(block_group);
@@ -2340,12 +2452,12 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_devices->device_list_mutex);
}
-bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ u64 total = btrfs_super_total_bytes(fs_info->super_copy);
u64 used = 0;
- u64 total = 0;
u64 factor;
ASSERT(btrfs_is_zoned(fs_info));
@@ -2358,7 +2470,6 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- total += device->disk_total_bytes;
used += device->bytes_used;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2412,7 +2523,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
@@ -2530,7 +2641,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
/* Release reservation for currently active block groups. */
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
- struct map_lookup *map = block_group->physical_map;
+ struct btrfs_chunk_map *map = block_group->physical_map;
if (!(block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b9cec523b778..7612e6572605 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -4,12 +4,27 @@
#define BTRFS_ZONED_H
#include <linux/types.h>
+#include <linux/atomic.h>
#include <linux/blkdev.h>
+#include <linux/blkzoned.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include "messages.h"
#include "volumes.h"
#include "disk-io.h"
#include "block-group.h"
#include "btrfs_inode.h"
+#include "fs.h"
+
+struct block_device;
+struct extent_buffer;
+struct btrfs_bio;
+struct btrfs_ordered_extent;
+struct btrfs_fs_info;
+struct btrfs_space_info;
+struct btrfs_eb_write_context;
+struct btrfs_fs_devices;
#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
@@ -38,14 +53,13 @@ struct btrfs_zoned_device_info {
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered);
#ifdef CONFIG_BLK_DEV_ZONED
-int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone);
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -59,8 +73,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb);
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
@@ -77,7 +89,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
-bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
@@ -85,11 +97,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
-static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone)
-{
- return 0;
-}
static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
@@ -123,7 +130,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
return -EOPNOTSUPP;
}
-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+static inline int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt)
{
return 0;
}
@@ -180,9 +188,6 @@ static inline int btrfs_load_block_group_zone_info(
static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
-static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb) { }
-
static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
return false;
@@ -237,7 +242,7 @@ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
-static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
return false;
}
@@ -323,8 +328,8 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
(bdev_zone_sectors(bdev) << SECTOR_SHIFT);
}
- /* Do not allow Host Manged zoned device */
- return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+ /* Do not allow Host Managed zoned device. */
+ return !bdev_is_zoned(bdev);
}
static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e7ac4ec809a4..c9ea37fabf65 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -18,11 +18,13 @@
#include <linux/slab.h>
#include <linux/zstd.h>
#include "misc.h"
+#include "fs.h"
+#include "btrfs_inode.h"
#include "compression.h"
-#include "ctree.h"
+#include "super.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
-#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
#define ZSTD_BTRFS_MAX_LEVEL 15
/* 307s to avoid pathologically clashing with transaction commit */
@@ -145,7 +147,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
}
/*
- * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ * Calculate monotonic memory bounds.
*
* It is possible based on the level configurations that a higher level
* workspace uses less memory than a lower level workspace. In order to reuse
@@ -218,7 +220,8 @@ void zstd_cleanup_workspace_manager(void)
}
/*
- * zstd_find_workspace - find workspace
+ * Find workspace for given level.
+ *
* @level: compression level
*
* This iterates over the set bits in the active_map beginning at the requested
@@ -256,7 +259,8 @@ static struct list_head *zstd_find_workspace(unsigned int level)
}
/*
- * zstd_get_workspace - zstd's get_workspace
+ * Zstd get_workspace for level.
+ *
* @level: compression level
*
* If @level is 0, then any compression level can be used. Therefore, we begin
@@ -296,7 +300,8 @@ again:
}
/*
- * zstd_put_workspace - zstd put_workspace
+ * Zstd put_workspace.
+ *
* @ws: list_head for the workspace
*
* When putting back a workspace, we only need to update the LRU if we are of
@@ -370,52 +375,63 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
zstd_cstream *stream;
int ret = 0;
- int nr_pages = 0;
- struct page *in_page = NULL; /* The current page to read */
- struct page *out_page = NULL; /* The current page to write to */
+ int nr_folios = 0;
+ struct folio *in_folio = NULL; /* The current folio to read. */
+ struct folio *out_folio = NULL; /* The current folio to write to. */
unsigned long tot_in = 0;
unsigned long tot_out = 0;
unsigned long len = *total_out;
- const unsigned long nr_dest_pages = *out_pages;
- unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+ const unsigned long nr_dest_folios = *out_folios;
+ const u64 orig_end = start + len;
+ unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ unsigned int pg_off;
+ unsigned int cur_len;
zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
len);
- *out_pages = 0;
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
/* Initialize the stream */
stream = zstd_init_cstream(&params, len, workspace->mem,
workspace->size);
- if (!stream) {
- pr_warn("BTRFS: zstd_init_cstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd compression init level %d failed, root %llu inode %llu offset %llu",
+ workspace->req_level, btrfs_root_id(inode->root),
+ btrfs_ino(inode), start);
ret = -EIO;
goto out;
}
/* map in the first page of input data */
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+ if (ret < 0)
+ goto out;
+ pg_off = offset_in_page(start);
+ cur_len = btrfs_calc_input_length(orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
-
+ workspace->in_buf.size = cur_len;
/* Allocate and map in the output buffer */
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -424,9 +440,14 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_compress_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_warn(inode->root->fs_info,
+"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
+ workspace->req_level, zstd_get_error_code(ret2),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
ret = -EIO;
goto out;
}
@@ -450,17 +471,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -476,22 +497,32 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
kunmap_local(workspace->in_buf.src);
- put_page(in_page);
- start += PAGE_SIZE;
- len -= PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap_local_page(in_page);
+ workspace->in_buf.src = NULL;
+ folio_put(in_folio);
+ start += cur_len;
+ len -= cur_len;
+ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+ if (ret < 0)
+ goto out;
+ pg_off = offset_in_page(start);
+ cur_len = btrfs_calc_input_length(orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ workspace->in_buf.size = cur_len;
}
}
while (1) {
size_t ret2;
ret2 = zstd_end_stream(stream, &workspace->out_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_end_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
+ workspace->req_level, zstd_get_error_code(ret2),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
ret = -EIO;
goto out;
}
@@ -507,17 +538,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -531,10 +562,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = tot_in;
*total_out = tot_out;
out:
- *out_pages = nr_pages;
+ *out_folios = nr_folios;
if (workspace->in_buf.src) {
kunmap_local(workspace->in_buf.src);
- put_page(in_page);
+ folio_put(in_folio);
}
return ret;
}
@@ -542,24 +573,28 @@ out:
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- struct page **pages_in = cb->compressed_pages;
+ struct folio **folios_in = cb->compressed_folios;
size_t srclen = cb->compressed_len;
zstd_dstream *stream;
int ret = 0;
- unsigned long page_in_index = 0;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long folio_in_index = 0;
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long total_out = 0;
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
- if (!stream) {
- pr_debug("BTRFS: zstd_init_dstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression init failed, root %llu inode %llu offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
ret = -EIO;
goto done;
}
- workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -572,9 +607,13 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression failed, error %d root %llu inode %llu offset %llu",
+ zstd_get_error_code(ret2), btrfs_root_id(inode->root),
+ btrfs_ino(inode), cb->start);
ret = -EIO;
goto done;
}
@@ -596,14 +635,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap_local(workspace->in_buf.src);
- page_in_index++;
- if (page_in_index >= total_pages_in) {
+ folio_in_index++;
+ if (folio_in_index >= total_folios_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+ workspace->in_buf.src =
+ kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -616,80 +656,58 @@ done:
}
int zstd_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
zstd_dstream *stream;
int ret = 0;
- size_t ret2;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
+ unsigned long to_copy = 0;
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
- if (!stream) {
- pr_warn("BTRFS: zstd_init_dstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression init failed, root %llu inode %llu offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ folio_pos(dest_folio));
ret = -EIO;
goto finish;
}
- destlen = min_t(size_t, destlen, PAGE_SIZE);
-
workspace->in_buf.src = data_in;
workspace->in_buf.pos = 0;
workspace->in_buf.size = srclen;
workspace->out_buf.dst = workspace->buf;
workspace->out_buf.pos = 0;
- workspace->out_buf.size = PAGE_SIZE;
-
- ret2 = 1;
- while (pg_offset < destlen
- && workspace->in_buf.pos < workspace->in_buf.size) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- /* Check if the frame is over and we still need more input */
- if (ret2 == 0) {
- pr_debug("BTRFS: zstd_decompress_stream ended early\n");
- ret = -EIO;
- goto finish;
- }
- ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
- &workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
- zstd_get_error_code(ret2));
- ret = -EIO;
- goto finish;
- }
-
- buf_start = total_out;
- total_out += workspace->out_buf.pos;
- workspace->out_buf.pos = 0;
-
- if (total_out <= start_byte)
- continue;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min_t(unsigned long, destlen - pg_offset,
- workspace->out_buf.size - buf_offset);
-
- memcpy_to_page(dest_page, pg_offset,
- workspace->out_buf.dst + buf_offset, bytes);
-
- pg_offset += bytes;
+ workspace->out_buf.size = sectorsize;
+
+ /*
+ * Since both input and output buffers should not exceed one sector,
+ * one call should end the decompression.
+ */
+ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf);
+ if (unlikely(zstd_is_error(ret))) {
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression failed, error %d root %llu inode %llu offset %llu",
+ zstd_get_error_code(ret), btrfs_root_id(inode->root),
+ btrfs_ino(inode), folio_pos(dest_folio));
+ goto finish;
}
- ret = 0;
+ to_copy = workspace->out_buf.pos;
+ memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy);
finish:
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
+ /* Error or early end. */
+ if (unlikely(to_copy < destlen)) {
+ ret = -EIO;
+ folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy);
}
return ret;
}