Diffstat (limited to 'fs/btrfs'): 47 files changed, 3065 insertions, 2216 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 68ebe188446a..78556447e1d5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node) } /* - * We maintain three seperate rbtrees: one for direct refs, one for + * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not * have a key. Each tree does merge on insertion. * @@ -695,7 +695,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } /* - * Now it's a direct ref, put it in the the direct tree. We must + * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. */ prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); @@ -2020,9 +2020,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, ret = -ENOMEM; break; } - extent_buffer_get(eb); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item = btrfs_item_nr(slot); @@ -2042,7 +2039,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } - btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } @@ -2083,10 +2079,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ret = -ENOMEM; break; } - extent_buffer_get(eb); - - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item_size = btrfs_item_size_nr(eb, slot); @@ -2107,7 +2099,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, cur_offset += btrfs_inode_extref_name_len(eb, extref); cur_offset += sizeof(*extref); } - btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); offset++; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 97d91e55b70a..6f5d07415dab 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -20,7 +20,7 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE = 0, + BTRFS_INODE_ORDERED_DATA_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, @@ -29,6 +29,7 @@ enum { BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, + BTRFS_INODE_SNAPSHOT_FLUSH, }; /* in memory btrfs inode */ @@ -147,6 +148,12 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Track the transaction id of the last transaction used to create a + * hard link for the inode. This is used by the log tree (fsync). + */ + u64 last_link_trans; + + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. 
*/ @@ -253,6 +260,11 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) return false; } +static inline bool is_data_inode(struct inode *inode) +{ + return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; +} + static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, int mod) { diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 2e43fba44035..b0c8094528d1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1202,24 +1202,24 @@ static void btrfsic_read_from_block_data( void *dstv, u32 offset, size_t len) { size_t cur; - size_t offset_in_page; + size_t pgoff; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(block_ctx->start); unsigned long i = (start_offset + offset) >> PAGE_SHIFT; WARN_ON(offset + len > block_ctx->len); - offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1); + pgoff = offset_in_page(start_offset + offset); while (len > 0) { - cur = min(len, ((size_t)PAGE_SIZE - offset_in_page)); + cur = min(len, ((size_t)PAGE_SIZE - pgoff)); BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); kaddr = block_ctx->datav[i]; - memcpy(dst, kaddr + offset_in_page, cur); + memcpy(dst, kaddr + pgoff, cur); dst += cur; len -= cur; - offset_in_page = 0; + pgoff = 0; i++; } } @@ -1601,7 +1601,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, BUG_ON(block_ctx->datav); BUG_ON(block_ctx->pagev); BUG_ON(block_ctx->mem_to_free); - if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", block_ctx->dev_bytenr); return -1; @@ -1720,7 +1720,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, num_pages = state->metablock_size >> PAGE_SHIFT; h = (struct btrfs_header *)datav[0]; - if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE)) + if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) return 1; for (i = 0; i < num_pages; i++) { @@ -1778,7 +1778,7 @@ again: return; } is_metadata = 1; - BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1)); + BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { @@ -2327,7 +2327,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, * write operations. Therefore it keeps the linkage * information for a block until a block is * rewritten. This can temporarily cause incorrect - * and even circular linkage informations. This + * and even circular linkage information. This * causes no harm unless such blocks are referenced * by the most recent super block. 
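The check-integrity.c hunks above (and several later hunks) replace open-coded page masks such as block_ctx->start & ((u64)PAGE_SIZE - 1) with the generic offset_in_page() and PAGE_ALIGNED() helpers. Below is a minimal user-space sketch of the equivalence the conversion relies on; the fixed PAGE_SHIFT of 12 and the standalone program are assumptions for illustration, only the mask arithmetic mirrors the kernel helpers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Model of the kernel page constants, assuming a 4 KiB page size. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* offset_in_page(): byte offset of an address within its page. */
#define offset_in_page(p)  ((uint64_t)(p) & ~PAGE_MASK)
/* PAGE_ALIGNED(): true when an address sits exactly on a page boundary. */
#define PAGE_ALIGNED(p)    (((uint64_t)(p) & (PAGE_SIZE - 1)) == 0)

int main(void)
{
        uint64_t bytenr = (123ULL << PAGE_SHIFT) + 100;

        /* Same results as the open-coded forms removed by the patch. */
        assert(offset_in_page(bytenr) == (bytenr & ((uint64_t)PAGE_SIZE - 1)));
        assert(!PAGE_ALIGNED(bytenr));
        assert(PAGE_ALIGNED(bytenr & PAGE_MASK));

        printf("offset_in_page(%llu) = %llu\n",
               (unsigned long long)bytenr,
               (unsigned long long)offset_in_page(bytenr));
        return 0;
}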
*/ @@ -2892,12 +2892,12 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(fs_info->nodesize)) { pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", fs_info->nodesize, PAGE_SIZE); return -1; } - if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(fs_info->sectorsize)) { pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", fs_info->sectorsize, PAGE_SIZE); return -1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2955a4ea2fa8..548057630b69 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -229,7 +229,6 @@ static noinline void end_compressed_writeback(struct inode *inode, */ static void end_compressed_bio_write(struct bio *bio) { - struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; @@ -248,14 +247,10 @@ static void end_compressed_bio_write(struct bio *bio) * call back into the FS and do all the end_io operations */ inode = cb->inode; - tree = &BTRFS_I(inode)->io_tree; cb->compressed_pages[0]->mapping = cb->inode->i_mapping; - tree->ops->writepage_end_io_hook(cb->compressed_pages[0], - cb->start, - cb->start + cb->len - 1, - NULL, - bio->bi_status ? - BLK_STS_OK : BLK_STS_NOTSUPP); + btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], + cb->start, cb->start + cb->len - 1, + bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -306,7 +301,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - WARN_ON(start & ((u64)PAGE_SIZE - 1)); + WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; @@ -337,7 +332,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, + 0); page->mapping = NULL; if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < @@ -481,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (page->index == end_index) { char *userpage; - size_t zero_offset = isize & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; @@ -615,8 +611,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, - comp_bio, 0); + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, + comp_bio, 0); page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < @@ -1207,7 +1203,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, /* * Shannon Entropy calculation * - * Pure byte distribution analysis fails to determine compressiability of data. + * Pure byte distribution analysis fails to determine compressibility of data. * Try calculating entropy to estimate the average minimum number of bits * needed to encode the sampled data. 
* @@ -1271,7 +1267,7 @@ static u8 get4bits(u64 num, int shift) { /* * Use 4 bits as radix base - * Use 16 u32 counters for calculating new possition in buf array + * Use 16 u32 counters for calculating new position in buf array * * @array - array that will be sorted * @array_buf - buffer array to store sorting results diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 539901fb5165..d92462fe66c8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -12,6 +12,7 @@ #include "transaction.h" #include "print-tree.h" #include "locking.h" +#include "volumes.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -224,7 +225,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer_fsid(cow, fs_info->fsid); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) @@ -1050,7 +1051,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer_fsid(cow, fs_info->fsid); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { @@ -1290,7 +1291,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); - extent_buffer_get(eb_rewin); btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1362,7 +1362,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - extent_buffer_get(eb); btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); @@ -1415,7 +1414,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * * What is forced COW: * when we create snapshot during committing the transaction, - * after we've finished coping src root, we must COW the shared + * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. */ if (btrfs_header_generation(buf) == trans->transid && @@ -1441,6 +1440,10 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; + if (test_bit(BTRFS_ROOT_DELETING, &root->state)) + btrfs_err(fs_info, + "COW'ing blocks on a fs root that's being dropped"); + if (trans->transaction != fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n", trans->transid, @@ -2584,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, root_lock = BTRFS_READ_LOCK; if (p->search_commit_root) { - /* The commit roots are read only so we always do read locks */ - if (p->need_commit_sem) + /* + * The commit roots are read only so we always do read locks, + * and we always must hold the commit_root_sem when doing + * searches on them, the only exception is send where we don't + * want to block transaction commits for a long time, so + * we need to clone the commit root in order to avoid races + * with transaction commits that create a snapshot of one of + * the roots used by a send operation. 
+ */ + if (p->need_commit_sem) { down_read(&fs_info->commit_root_sem); - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (p->need_commit_sem) + b = btrfs_clone_extent_buffer(root->commit_root); up_read(&fs_info->commit_root_sem); + if (!b) + return ERR_PTR(-ENOMEM); + + } else { + b = root->commit_root; + extent_buffer_get(b); + } + level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when * p->search_commit_root = 1. @@ -2717,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto done; + } while (b) { level = btrfs_header_level(b); @@ -3751,7 +3771,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete * it, therefore use right neighbor to insert the new item and - * no need to touch/dirty our left leaft. */ + * no need to touch/dirty our left leaf. */ btrfs_tree_unlock(left); free_extent_buffer(left); path->nodes[0] = right; @@ -5390,7 +5410,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } - extent_buffer_get(left_path->nodes[left_level]); right_level = btrfs_header_level(right_root->commit_root); right_root_level = right_level; @@ -5401,7 +5420,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } - extent_buffer_get(right_path->nodes[right_level]); up_read(&fs_info->commit_root_sem); if (left_level == 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68f322f600a0..f031a447a047 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -109,13 +109,26 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } /* - * File system states + * Runtime (in-memory) states of filesystem */ -#define BTRFS_FS_STATE_ERROR 0 -#define BTRFS_FS_STATE_REMOUNTING 1 -#define BTRFS_FS_STATE_TRANS_ABORTED 2 -#define BTRFS_FS_STATE_DEV_REPLACING 3 -#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Track if a transaction abort has been reported on this filesystem */ + BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, +}; #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -195,9 +208,10 @@ struct btrfs_root_backup { * it currently lacks any block count etc etc */ struct btrfs_super_block { - u8 csum[BTRFS_CSUM_SIZE]; /* the first 4 fields must match struct btrfs_header */ - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific UUID, visible to user */ + u8 fsid[BTRFS_FSID_SIZE]; __le64 bytenr; /* this block number */ __le64 flags; @@ -234,8 +248,11 @@ struct btrfs_super_block { __le64 cache_generation; __le64 uuid_tree_generation; + /* the UUID written into btree blocks */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* future expansion */ - __le64 reserved[30]; + __le64 reserved[28]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct 
btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); @@ -265,7 +282,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES) + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -316,7 +334,7 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ -enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; +enum { READA_NONE, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; @@ -360,9 +378,7 @@ struct btrfs_dev_replace { struct btrfs_device *tgtdev; struct mutex lock_finishing_cancel_unmount; - rwlock_t lock; - atomic_t blocking_readers; - wait_queue_head_t read_lock_wq; + struct rw_semaphore rwsem; struct btrfs_scrub_progress scrub_progress; @@ -443,13 +459,19 @@ struct btrfs_space_info { struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; -#define BTRFS_BLOCK_RSV_GLOBAL 1 -#define BTRFS_BLOCK_RSV_DELALLOC 2 -#define BTRFS_BLOCK_RSV_TRANS 3 -#define BTRFS_BLOCK_RSV_CHUNK 4 -#define BTRFS_BLOCK_RSV_DELOPS 5 -#define BTRFS_BLOCK_RSV_EMPTY 6 -#define BTRFS_BLOCK_RSV_TEMP 7 +/* + * Types of block reserves + */ +enum { + BTRFS_BLOCK_RSV_GLOBAL, + BTRFS_BLOCK_RSV_DELALLOC, + BTRFS_BLOCK_RSV_TRANS, + BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_DELOPS, + BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_EMPTY, + BTRFS_BLOCK_RSV_TEMP, +}; struct btrfs_block_rsv { u64 size; @@ -509,18 +531,18 @@ struct btrfs_free_cluster { }; enum btrfs_caching_type { - BTRFS_CACHE_NO = 0, - BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FAST = 2, - BTRFS_CACHE_FINISHED = 3, - BTRFS_CACHE_ERROR = 4, + BTRFS_CACHE_NO, + BTRFS_CACHE_STARTED, + BTRFS_CACHE_FAST, + BTRFS_CACHE_FINISHED, + BTRFS_CACHE_ERROR, }; enum btrfs_disk_cache_state { - BTRFS_DC_WRITTEN = 0, - BTRFS_DC_ERROR = 1, - BTRFS_DC_CLEAR = 2, - BTRFS_DC_SETUP = 3, + BTRFS_DC_WRITTEN, + BTRFS_DC_ERROR, + BTRFS_DC_CLEAR, + BTRFS_DC_SETUP, }; struct btrfs_caching_control { @@ -712,41 +734,61 @@ struct btrfs_fs_devices; struct btrfs_balance_control; struct btrfs_delayed_root; -#define BTRFS_FS_BARRIER 1 -#define BTRFS_FS_CLOSING_START 2 -#define BTRFS_FS_CLOSING_DONE 3 -#define BTRFS_FS_LOG_RECOVERING 4 -#define BTRFS_FS_OPEN 5 -#define BTRFS_FS_QUOTA_ENABLED 6 -#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 -#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 -#define BTRFS_FS_BTREE_ERR 11 -#define BTRFS_FS_LOG1_ERR 12 -#define BTRFS_FS_LOG2_ERR 13 -#define BTRFS_FS_QUOTA_OVERRIDE 14 -/* Used to record internally whether fs has been frozen */ -#define BTRFS_FS_FROZEN 15 - -/* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ -#define BTRFS_FS_EXCL_OP 16 - /* - * To info transaction_kthread we need an immediate commit so it doesn't - * need to wait for commit_interval + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree. 
*/ -#define BTRFS_FS_NEED_ASYNC_COMMIT 17 +struct btrfs_swapfile_pin { + struct rb_node node; + void *ptr; + struct inode *inode; + /* + * If true, ptr points to a struct btrfs_block_group_cache. Otherwise, + * ptr points to a struct btrfs_device. + */ + bool is_block_group; +}; -/* - * Indicate that balance has been set up from the ioctl and is in the main - * phase. The fs_info::balance_ctl is initialized. - */ -#define BTRFS_FS_BALANCE_RUNNING 18 +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + +enum { + BTRFS_FS_BARRIER, + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ + BTRFS_FS_EXCL_OP, + /* + * To info transaction_kthread we need an immediate commit so it + * doesn't need to wait for commit_interval + */ + BTRFS_FS_NEED_ASYNC_COMMIT, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, +}; struct btrfs_fs_info { - u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; struct btrfs_root *extent_root; @@ -790,6 +832,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv chunk_block_rsv; /* block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; + /* block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -1114,6 +1158,10 @@ struct btrfs_fs_info { u32 sectorsize; u32 stripesize; + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1133,22 +1181,24 @@ struct btrfs_subvolume_writers { /* * The state of btrfs root */ -/* - * btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. So IN_TRANS_SETUP - * is used to tell us when more checks are required - */ -#define BTRFS_ROOT_IN_TRANS_SETUP 0 -#define BTRFS_ROOT_REF_COWS 1 -#define BTRFS_ROOT_TRACK_DIRTY 2 -#define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 -#define BTRFS_ROOT_DEFRAG_RUNNING 5 -#define BTRFS_ROOT_FORCE_COW 6 -#define BTRFS_ROOT_MULTI_LOG_TASKS 7 -#define BTRFS_ROOT_DIRTY 8 +enum { + /* + * btrfs_record_root_in_trans is a multi-step process, and it can race + * with the balancing code. But the race is very small, and only the + * first time the root is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ + BTRFS_ROOT_IN_TRANS_SETUP, + BTRFS_ROOT_REF_COWS, + BTRFS_ROOT_TRACK_DIRTY, + BTRFS_ROOT_IN_RADIX, + BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + BTRFS_ROOT_DEFRAG_RUNNING, + BTRFS_ROOT_FORCE_COW, + BTRFS_ROOT_MULTI_LOG_TASKS, + BTRFS_ROOT_DIRTY, + BTRFS_ROOT_DELETING, +}; /* * in ram representation of the tree. 
extent_root is used for all allocations @@ -1274,6 +1324,9 @@ struct btrfs_root { u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + /* Number of active swapfiles */ + atomic_t nr_swapfiles; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -2570,10 +2623,10 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) /* extent-tree.c */ enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID = 0, - BTRFS_REF_TYPE_BLOCK = 1, - BTRFS_REF_TYPE_DATA = 2, - BTRFS_REF_TYPE_ANY = 3, + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2599,7 +2652,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2713,10 +2766,12 @@ enum btrfs_reserve_flush_enum { enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + ALLOC_CHUNK = 7, + COMMIT_TRANS = 8, }; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); @@ -2767,6 +2822,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); @@ -3141,7 +3203,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, @@ -3150,9 +3212,16 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags); + void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits); +void btrfs_clear_delalloc_extent(struct inode *inode, + struct extent_state *state, unsigned *bits); +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split); +int 
btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); @@ -3189,6 +3258,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, unsigned long *nr_written, + struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -3428,6 +3503,16 @@ static inline void assfail(const char *expr, const char *file, int line) #define ASSERT(expr) ((void)0) #endif +/* + * Use that for functions that are conditionally exported for sanity tests but + * otherwise static + */ +#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define EXPORT_FOR_TESTS static +#else +#define EXPORT_FOR_TESTS +#endif + __cold static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9301b3ad9217..cad36c99a483 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -251,8 +251,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; } static bool merge_ref(struct btrfs_trans_handle *trans, @@ -400,6 +398,20 @@ again: return head; } +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + lockdep_assert_held(&delayed_refs->lock); + lockdep_assert_held(&head->lock); + + rb_erase_cached(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); + atomic_dec(&delayed_refs->num_entries); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; +} + /* * Helper to insert the ref_node to the tail or merge with tail. * @@ -453,7 +465,6 @@ inserted: if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); - trans->delayed_ref_updates++; spin_unlock(&href->lock); return ret; } @@ -462,12 +473,14 @@ inserted: * helper function to update the accounting in the head ref * existing and update must have the same bytenr */ -static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, +static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -525,10 +538,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * versa we need to make sure to adjust pending_csums accordingly. 
*/ if (existing->is_data) { - if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + u64 csum_leaves = + btrfs_csum_bytes_to_leaves(fs_info, + existing->num_bytes); + + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + } + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; + trans->delayed_ref_updates += csum_leaves; + } } spin_unlock(&existing->lock); } @@ -634,7 +655,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, && head_ref->qgroup_reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, existing, head_ref, + update_existing_head_ref(trans, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -645,8 +666,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } else { if (old_ref_mod) *old_ref_mod = 0; - if (head_ref->is_data && head_ref->ref_mod < 0) + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; + trans->delayed_ref_updates += + btrfs_csum_bytes_to_leaves(trans->fs_info, + head_ref->num_bytes); + } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -782,6 +807,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + trace_add_delayed_tree_ref(fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); @@ -863,6 +894,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); @@ -899,6 +936,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); + + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. 
+ */ + btrfs_update_delayed_refs_rsv(trans); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 8e20c5cb5404..d2af974f68a1 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -261,7 +261,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } - +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2aa48aecc52b..8750c835f535 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -59,7 +59,6 @@ no_valid_dev_replace_entry_found: BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; - dev_replace->replace_state = 0; dev_replace->time_started = 0; dev_replace->time_stopped = 0; atomic64_set(&dev_replace->num_write_errors, 0); @@ -285,13 +284,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_dev_replace_item *ptr; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); if (!dev_replace->is_valid || !dev_replace->item_needs_writeback) { - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); return 0; } - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -349,7 +348,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_replace_item); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); if (dev_replace->srcdev) btrfs_set_dev_replace_src_devid(eb, ptr, dev_replace->srcdev->devid); @@ -372,7 +371,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); btrfs_mark_buffer_dirty(eb); @@ -390,7 +389,7 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) { @@ -407,6 +406,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(src_device)) return PTR_ERR(src_device); + if (btrfs_pinned_by_swapfile(fs_info, src_device)) { + btrfs_warn_in_rcu(fs_info, + "cannot replace device %s (devid %llu) due to active swapfile", + btrfs_dev_name(src_device), src_device->devid); + return -ETXTBSY; + } + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); if (ret) @@ -426,7 +432,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, } need_unlock = true; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -464,7 +470,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); 
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); @@ -478,7 +484,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) { ret = PTR_ERR(trans); need_unlock = true; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->srcdev = NULL; @@ -497,7 +503,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else { + } else if (ret != -ECANCELED) { WARN_ON(ret); } @@ -505,7 +511,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, leave: if (need_unlock) - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -533,8 +539,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, args->start.cont_reading_from_srcdev_mode); args->result = ret; /* don't warn if EINPROGRESS, someone else might be running scrub */ - if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) - ret = 0; + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) + return 0; return ret; } @@ -572,18 +579,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* don't allow cancel or unmount to disturb the finishing procedure */ mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return 0; } tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); /* * flush all outstanding I/O and inode extent mappings before the @@ -607,7 +614,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); dev_replace->replace_state = scrub_ret ? 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -622,12 +629,13 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - btrfs_err_in_rcu(fs_info, + if (scrub_ret != -ECANCELED) + btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); @@ -663,8 +671,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - btrfs_dev_replace_write_unlock(dev_replace); - + up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_remove_srcdev(src_device); @@ -761,7 +768,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); /* even if !dev_replace_is_valid, the values are good enough for * the replace_status ioctl */ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -773,7 +780,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, args->status.num_uncorrectable_read_errors = atomic64_read(&dev_replace->num_uncorrectable_read_errors); args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); } int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) @@ -790,46 +797,74 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; - btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + up_write(&dev_replace->rwsem); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + up_write(&dev_replace->rwsem); + ret = btrfs_scrub_cancel(fs_info); + if (ret < 0) { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + } else { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + /* + * btrfs_dev_replace_finishing() will handle the + * cleanup part + */ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + } + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * Scrub doing the replace isn't running so we need to do the + * cleanup step of btrfs_dev_replace_finishing() here + */ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = ktime_get_real_seconds(); - dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + dev_replace->replace_state = + 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - WARN_ON(ret); + up_write(&dev_replace->rwsem); - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + /* Scrub for replace must not be running in suspended state */ + ret = btrfs_scrub_cancel(fs_info); + ASSERT(ret != -ENOTCONN); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_info_in_rcu(fs_info, + "suspended dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + break; + default: + result = -EINVAL; + } -leave: mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return result; } @@ -839,7 +874,8 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -855,7 +891,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) break; } - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); } @@ -865,12 +901,13 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); return 0; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: break; @@ -884,10 +921,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); - btrfs_dev_replace_write_unlock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); return 0; } - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); /* * This could collide with a paused balance, but the exclusive op logic @@ -895,6 +934,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * dev-replace to start anyway. 
*/ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + down_write(&dev_replace->rwsem); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); btrfs_info(fs_info, "cannot resume dev-replace, other exclusive operation running"); return 0; @@ -925,7 +968,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - WARN_ON(ret); + WARN_ON(ret && ret != -ECANCELED); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; @@ -948,7 +991,7 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) * something that can happen if the dev_replace * procedure is suspended by an umount and then * the tgtdev is missing (or "btrfs dev scan") was - * not called and the the filesystem is remounted + * not called and the filesystem is remounted * in degraded state. This does not stop the * dev_replace procedure. It needs to be canceled * manually if the cancellation is wanted. @@ -958,42 +1001,6 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) -{ - read_lock(&dev_replace->lock); -} - -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) -{ - read_unlock(&dev_replace->lock); -} - -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace) -{ -again: - wait_event(dev_replace->read_lock_wq, - atomic_read(&dev_replace->blocking_readers) == 0); - write_lock(&dev_replace->lock); - if (atomic_read(&dev_replace->blocking_readers)) { - write_unlock(&dev_replace->lock); - goto again; - } -} - -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace) -{ - write_unlock(&dev_replace->lock); -} - -/* inc blocking cnt and release read lock */ -void btrfs_dev_replace_set_lock_blocking( - struct btrfs_dev_replace *dev_replace) -{ - /* only set blocking for read lock */ - atomic_inc(&dev_replace->blocking_readers); - read_unlock(&dev_replace->lock); -} - void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) { percpu_counter_inc(&fs_info->dev_replace.bio_counter); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 795c551f5b5e..4aa40bacc6cc 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -13,19 +13,11 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, - const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, - int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); #endif diff --git 
a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f0b6d1936e8..8da2f380d3c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, len = buf->len - offset; while (len > 0) { + /* + * Note: we don't need to check for the err == 1 case here, as + * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' + * and 'min_len = 32' and the currently implemented mapping + * algorithm we cannot cross a page boundary. + */ err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); if (err) @@ -477,9 +483,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, int mirror_num = 0; int failed_mirror = 0; - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num); if (!ret) { @@ -493,15 +499,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, break; } - /* - * This buffer's crc is fine, but its contents are corrupted, so - * there is no reason to read the other copies, they won't be - * any less wrong. - */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || - ret == -EUCLEAN) - break; - num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); if (num_copies == 1) @@ -551,7 +548,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) if (WARN_ON(!PageUptodate(page))) return -EUCLEAN; - ASSERT(memcmp_extent_buffer(eb, fs_info->fsid, + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); return csum_tree_block(fs_info, eb, 0); @@ -566,7 +563,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); while (fs_devices) { - if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + u8 *metadata_uuid; + + /* + * Checking the incompat flag is only valid for the current + * fs. For seed devices it's forbidden to have their uuid + * changed so reading ->fsid in this case is fine + */ + if (fs_devices == fs_info->fs_devices && + btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; + + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { ret = 0; break; } @@ -669,19 +679,6 @@ out: return ret; } -static int btree_io_failed_hook(struct page *page, int failed_mirror) -{ - struct extent_buffer *eb; - - eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = failed_mirror; - atomic_dec(&eb->io_pages); - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, -EIO); - return -EIO; /* we fixed nothing */ -} - static void end_workqueue_bio(struct bio *bio) { struct btrfs_end_io_wq *end_io_wq = bio->bi_private; @@ -760,11 +757,22 @@ static void run_one_async_start(struct btrfs_work *work) async->status = ret; } +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. 
+ */ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; + struct inode *inode; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); + inode = async->private_data; /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -773,7 +781,12 @@ static void run_one_async_done(struct btrfs_work *work) return; } - btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num); + ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, + async->mirror_num, 1); + if (ret) { + async->bio->bi_status = ret; + bio_endio(async->bio); + } } static void run_one_async_free(struct btrfs_work *work) @@ -1187,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); + atomic_set(&root->nr_swapfiles, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -2127,10 +2141,8 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - rwlock_init(&fs_info->dev_replace.lock); - atomic_set(&fs_info->dev_replace.blocking_readers, 0); + init_rwsem(&fs_info->dev_replace.rwsem); init_waitqueue_head(&fs_info->dev_replace.replace_wait); - init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) @@ -2451,10 +2463,11 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, - "dev_item UUID does not match fsid: %pU != %pU", - fs_info->fsid, sb->dev_item.fsid); + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); ret = -EINVAL; } @@ -2665,6 +2678,9 @@ int open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); + btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, + BTRFS_BLOCK_RSV_DELREFS); + atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); @@ -2754,6 +2770,9 @@ int open_ctree(struct super_block *sb, fs_info->sectorsize = 4096; fs_info->stripesize = 4096; + spin_lock_init(&fs_info->swapfile_pins_lock); + fs_info->swapfile_pins = RB_ROOT; + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2790,11 +2809,29 @@ int open_ctree(struct super_block *sb, * the whole block of INFO_SIZE */ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - memcpy(fs_info->super_for_commit, fs_info->super_copy, - sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); + disk_super = fs_info->super_copy; + + ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, + BTRFS_FSID_SIZE)); + + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) { + ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid, + fs_info->super_copy->metadata_uuid, + BTRFS_FSID_SIZE)); + } + + features = btrfs_super_flags(disk_super); + if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + features &= 
~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; + btrfs_set_super_flags(disk_super, features); + btrfs_info(fs_info, + "found metadata UUID change in progress flag, clearing"); + } + + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); ret = btrfs_validate_mount_super(fs_info); if (ret) { @@ -2803,7 +2840,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2915,7 +2951,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); + memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); @@ -3064,7 +3100,7 @@ retry_root_backup: if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, - "writeable mount is not allowed due to too many missing devices"); + "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; } @@ -3733,7 +3769,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); - memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, + BTRFS_FSID_SIZE); flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); @@ -4040,7 +4077,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * This is a fast path so only do this check if we have sanity tests - * enabled. Normal people shouldn't be using umapped buffers as dirty + * enabled. Normal people shouldn't be using unmapped buffers as dirty * outside of the sanity tests. */ if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) @@ -4338,6 +4375,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, unpin = pinned_extents; again: while (1) { + struct extent_state *cached_state = NULL; + /* * The btrfs_finish_extent_commit() may get the same range as * ours between find_first_extent_bit and clear_extent_dirty. 
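The disk-io.c hunks above (check_tree_block_fsid, validate_super, open_ctree, write_all_supers) switch several comparisons from fs_info->fsid to fs_devices->metadata_uuid: with the METADATA_UUID incompat bit set, tree blocks carry the metadata UUID rather than the user-visible fsid, while seed devices always keep their fsid. The following is a minimal sketch of that selection rule using simplified stand-in structures, not the kernel types.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define BTRFS_FSID_SIZE 16

/* Simplified stand-in for btrfs_fs_devices (illustration only). */
struct fs_devices_model {
        unsigned char fsid[BTRFS_FSID_SIZE];          /* user-visible UUID */
        unsigned char metadata_uuid[BTRFS_FSID_SIZE]; /* UUID stamped into tree blocks */
};

/*
 * Mirrors the choice made in check_tree_block_fsid(): only the current
 * filesystem may have a metadata UUID differing from its fsid; for seed
 * devices the plain fsid is always used.
 */
static const unsigned char *uuid_for_tree_blocks(const struct fs_devices_model *devs,
                                                 bool is_current_fs,
                                                 bool has_metadata_uuid_flag)
{
        if (is_current_fs && has_metadata_uuid_flag)
                return devs->metadata_uuid;
        return devs->fsid;
}

int main(void)
{
        struct fs_devices_model devs;

        memset(devs.fsid, 0xaa, BTRFS_FSID_SIZE);
        memset(devs.metadata_uuid, 0xbb, BTRFS_FSID_SIZE);

        /* With the incompat flag set, tree blocks are checked against metadata_uuid. */
        printf("flag set   -> compare against %s\n",
               uuid_for_tree_blocks(&devs, true, true) == devs.metadata_uuid ?
               "metadata_uuid" : "fsid");
        /* Without it (or for seed devices), the plain fsid is compared. */
        printf("flag clear -> compare against %s\n",
               uuid_for_tree_blocks(&devs, true, false) == devs.metadata_uuid ?
               "metadata_uuid" : "fsid");
        return 0;
}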
@@ -4346,13 +4385,14 @@ again: */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY, &cached_state); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - clear_extent_dirty(unpin, start, end); + clear_extent_dirty(unpin, start, end, &cached_state); + free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -4409,6 +4449,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4514,7 +4555,4 @@ static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, - .readpage_io_failed_hook = btree_io_failed_hook, - - /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4cccba22640f..987a64bc0c66 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -21,11 +21,11 @@ #define BTRFS_BDEV_BLOCKSIZE (4096) enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA = 0, - BTRFS_WQ_ENDIO_METADATA = 1, - BTRFS_WQ_ENDIO_FREE_SPACE = 2, - BTRFS_WQ_ENDIO_RAID56 = 3, - BTRFS_WQ_ENDIO_DIO_REPAIR = 4, + BTRFS_WQ_ENDIO_DATA, + BTRFS_WQ_ENDIO_METADATA, + BTRFS_WQ_ENDIO_FREE_SPACE, + BTRFS_WQ_ENDIO_RAID56, + BTRFS_WQ_ENDIO_DIO_REPAIR, }; static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1febf155747..b15afeae16df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -51,6 +51,24 @@ enum { CHUNK_ALLOC_FORCE = 2, }; +/* + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name) \ +static inline void update_##name(struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -1037,7 +1055,7 @@ out_free: /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requried, + * is_data == BTRFS_REF_TYPE_DATA, data type is required, * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
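The DECLARE_SPACE_INFO_UPDATE() helpers introduced above clamp a would-be underflow to zero instead of letting the counter wrap. A self-contained sketch of the same idea, assuming simplified types (a user-space model, not the kernel macro):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the couple of btrfs_space_info fields the macro touches. */
struct space_info_model {
	int64_t bytes_may_use;
	int64_t bytes_pinned;
};

/* Same shape as DECLARE_SPACE_INFO_UPDATE: clamp rather than wrap. */
#define DECLARE_SPACE_INFO_UPDATE_MODEL(name)				\
static void update_##name(struct space_info_model *sinfo, int64_t bytes) \
{									\
	if (bytes < 0 && sinfo->name < -bytes) {			\
		fprintf(stderr, "underflow of " #name " detected\n");	\
		sinfo->name = 0;					\
		return;							\
	}								\
	sinfo->name += bytes;						\
}

DECLARE_SPACE_INFO_UPDATE_MODEL(bytes_may_use)
DECLARE_SPACE_INFO_UPDATE_MODEL(bytes_pinned)

int main(void)
{
	struct space_info_model si = { .bytes_may_use = 4096 };

	update_bytes_may_use(&si, -8192);	/* would underflow: clamped */
	assert(si.bytes_may_use == 0);
	update_bytes_pinned(&si, 4096);
	assert(si.bytes_pinned == 4096);
	return 0;
}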
*/ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2406,25 +2424,82 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref btrfs_delayed_ref_unlock(head); } -static int cleanup_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +static struct btrfs_delayed_extent_op *cleanup_extent_op( + struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; - int ret; if (!extent_op) - return 0; - head->extent_op = NULL; + return NULL; + if (head->must_insert_reserved) { + head->extent_op = NULL; btrfs_free_delayed_extent_op(extent_op); - return 0; + return NULL; } + return extent_op; +} + +static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = cleanup_extent_op(head); + if (!extent_op) + return 0; + head->extent_op = NULL; spin_unlock(&head->lock); ret = run_delayed_extent_op(trans, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? ret : 1; } +static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + int nr_items = 1; /* Dropping this ref head update. */ + + if (head->total_ref_mod < 0) { + struct btrfs_space_info *space_info; + u64 flags; + + if (head->is_data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (head->is_system) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -head->num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + + /* + * We had csum deletions accounted for in our delayed refs rsv, + * we need to drop the csum leaves for this update from our + * delayed_refs_rsv. 
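The cleanup_ref_head_accounting() helper added here releases one reservation item for the head update itself plus, for a data extent whose refs went net negative, the csum leaves that no longer need deleting. A rough model of that item count; the csums-per-leaf figure is an assumption for the example, not the kernel's btrfs_csum_bytes_to_leaves() result:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t csum_leaves_model(uint64_t num_bytes)
{
	const uint64_t sectorsize = 4096;
	const uint64_t csums_per_leaf = 2048;	/* assumed leaf capacity */
	uint64_t num_csums = num_bytes / sectorsize;

	return (num_csums + csums_per_leaf - 1) / csums_per_leaf;
}

static uint64_t ref_head_release_items(bool is_data, bool freed_space,
				       uint64_t num_bytes)
{
	uint64_t nr_items = 1;	/* the ref head update itself */

	/* freed_space stands in for head->total_ref_mod < 0 */
	if (freed_space && is_data)
		nr_items += csum_leaves_model(num_bytes);
	return nr_items;
}

int main(void)
{
	printf("items for a freed 1MiB data extent: %llu\n",
	       (unsigned long long)ref_head_release_items(true, true,
							   1024 * 1024));
	return 0;
}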
+ */ + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, + head->num_bytes); + } + } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, + head->qgroup_reserved); + btrfs_delayed_refs_rsv_release(fs_info, nr_items); +} + static int cleanup_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { @@ -2435,7 +2510,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; - ret = cleanup_extent_op(trans, head); + ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { unselect_delayed_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); @@ -2456,37 +2531,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - delayed_refs->num_heads--; - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); + btrfs_delete_ref_head(delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); - atomic_dec(&delayed_refs->num_entries); - - trace_run_delayed_ref_head(fs_info, head, 0); - - if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; - - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = __find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - } - } if (head->must_insert_reserved) { btrfs_pin_extent(fs_info, head->bytenr, @@ -2497,9 +2544,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); + cleanup_ref_head_accounting(trans, head); + + trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); return 0; @@ -2792,40 +2839,28 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_rsv *global_rsv; - u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; - unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; - u64 num_bytes, num_dirty_bgs_bytes; - int ret = 0; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - num_heads = heads_to_leaves(fs_info, num_heads); - if (num_heads > 1) - num_bytes += (num_heads - 1) * fs_info->nodesize; - num_bytes <<= 1; - num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * - fs_info->nodesize; - num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, - num_dirty_bgs); - global_rsv = 
&fs_info->global_block_rsv; + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); /* - * If we can't allocate any more chunks lets make sure we have _lots_ of - * wiggle room since running delayed refs can create more delayed refs. + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. */ - if (global_rsv->space_info->full) { - num_dirty_bgs_bytes <<= 1; - num_bytes <<= 1; - } - - spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) - ret = 1; - spin_unlock(&global_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); return ret; } @@ -2844,7 +2879,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans); + return btrfs_check_space_for_delayed_refs(trans->fs_info); } struct async_delayed_refs { @@ -3588,6 +3623,8 @@ again: */ mutex_lock(&trans->transaction->cache_write_mutex); while (!list_empty(&dirty)) { + bool drop_reserve = true; + cache = list_first_entry(&dirty, struct btrfs_block_group_cache, dirty_list); @@ -3660,6 +3697,7 @@ again: list_add_tail(&cache->dirty_list, &cur_trans->dirty_bgs); btrfs_get_block_group(cache); + drop_reserve = false; } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { @@ -3667,9 +3705,11 @@ again: } } - /* if its not on the io list, we need to put the block group */ + /* if it's not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); if (ret) break; @@ -3818,6 +3858,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4256,7 +4297,7 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - data_sinfo->bytes_may_use += bytes; + update_bytes_may_use(data_sinfo, bytes); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); @@ -4309,10 +4350,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - if (WARN_ON(data_sinfo->bytes_may_use < len)) - data_sinfo->bytes_may_use = 0; - else - data_sinfo->bytes_may_use -= len; + update_bytes_may_use(data_sinfo, -len); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); @@ -4637,7 +4675,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, /* * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. For raid56, the space info used + * space is actually usable. 
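The rewritten btrfs_check_space_for_delayed_refs() above no longer estimates head and csum leaf counts; it reports pressure once the delayed refs rsv wants more than it and the global reserve hold together. A minimal sketch of that comparison, with locking omitted:

#include <stdbool.h>
#include <stdint.h>

/* Just enough state to express the check; not the kernel block_rsv type. */
struct rsv_totals {
	uint64_t size;		/* bytes the rsv thinks it needs */
	uint64_t reserved;	/* bytes it actually holds */
};

/* Throttle once the delayed refs rsv's size exceeds what it and the global
 * reserve hold between them. */
bool delayed_refs_need_throttle(const struct rsv_totals *delayed_refs_rsv,
				const struct rsv_totals *global_rsv)
{
	uint64_t reserved = global_rsv->reserved + delayed_refs_rsv->reserved;

	return delayed_refs_rsv->size >= reserved;
}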
For raid56, the space info used * doesn't include the parity drive, so we don't have to * change the math */ @@ -4793,8 +4831,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *trans; - u64 bytes; + u64 bytes_needed; + u64 reclaim_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) @@ -4807,15 +4847,15 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes = (ticket) ? ticket->bytes : 0; + bytes_needed = (ticket) ? ticket->bytes : 0; spin_unlock(&space_info->lock); - if (!bytes) + if (!bytes_needed) return 0; /* See if there is enough pinned space to make this reservation */ if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes, + bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; @@ -4827,14 +4867,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size > bytes) - bytes = 0; - else - bytes -= delayed_rsv->size; + reclaim_bytes += delayed_rsv->reserved; spin_unlock(&delayed_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes_needed) + goto commit; + bytes_needed -= reclaim_bytes; + if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes, + bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { return -ENOSPC; } @@ -4882,6 +4926,20 @@ static void flush_space(struct btrfs_fs_info *fs_info, shrink_delalloc(fs_info, num_bytes * 2, num_bytes, state == FLUSH_DELALLOC_WAIT); break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; case ALLOC_CHUNK: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -5108,7 +5166,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, list_del_init(&ticket->list); if (ticket->bytes && ticket->bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket->bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); } @@ -5154,13 +5212,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * If not things get more complicated. 
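may_commit_transaction() now counts the delayed refs rsv's reserved bytes as reclaimable alongside the delayed (inode) rsv before deciding whether a commit could satisfy the waiting ticket. A toy version of that decision, with ticket lookup, locking and percpu batching left out:

#include <stdbool.h>
#include <stdint.h>

bool should_commit_for_ticket(uint64_t bytes_needed,
			      uint64_t total_bytes_pinned,
			      uint64_t delayed_rsv_reserved,
			      uint64_t delayed_refs_rsv_reserved)
{
	uint64_t reclaim_bytes;

	if (!bytes_needed)
		return false;
	/* enough pinned space on its own? */
	if (total_bytes_pinned >= bytes_needed)
		return true;

	/* otherwise see if reclaimable reserves close the gap */
	reclaim_bytes = delayed_rsv_reserved + delayed_refs_rsv_reserved;
	if (reclaim_bytes >= bytes_needed)
		return true;
	return total_bytes_pinned >= bytes_needed - reclaim_bytes;
}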
*/ if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, system_chunk)) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; @@ -5223,7 +5281,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, if (ticket.bytes) { if (ticket.bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket.bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); @@ -5244,7 +5302,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to @@ -5354,6 +5412,90 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, + to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. 
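btrfs_migrate_to_delayed_refs_rsv() above keeps only what the target rsv is actually short of and hands the rest back to the space info. A sketch of that clamping under simplified types (locks, the "full" flag and tracepoints omitted):

#include <stdint.h>

struct rsv_fill {
	uint64_t size;
	uint64_t reserved;
};

/* Returns the bytes actually moved into the target; *to_free is what the
 * caller should give back to the space info. */
uint64_t migrate_to_rsv(struct rsv_fill *target, uint64_t num_bytes,
			uint64_t *to_free)
{
	uint64_t deficit = target->size > target->reserved ?
			   target->size - target->reserved : 0;

	*to_free = 0;
	if (num_bytes > deficit) {
		*to_free = num_bytes - deficit;
		num_bytes = deficit;
	}
	target->reserved += num_bytes;
	return num_bytes;
}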
+ */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + /* * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. @@ -5407,7 +5549,7 @@ again: flush = BTRFS_RESERVE_FLUSH_ALL; goto again; } - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); spin_unlock(&space_info->lock); @@ -5435,7 +5577,7 @@ again: ticket->bytes, 1); list_del_init(&ticket->list); num_bytes -= ticket->bytes; - space_info->bytes_may_use += ticket->bytes; + update_bytes_may_use(space_info, ticket->bytes); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); @@ -5443,7 +5585,7 @@ again: trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 1); - space_info->bytes_may_use += num_bytes; + update_bytes_may_use(space_info, num_bytes); ticket->bytes -= num_bytes; num_bytes = 0; } @@ -5629,11 +5771,11 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, /** * btrfs_inode_rsv_refill - refill the inode block rsv. * @inode - the inode we are refilling. - * @flush - the flusing restriction. + * @flush - the flushing restriction. * * Essentially the same as btrfs_block_rsv_refill, except it uses the * block_rsv->size as the minimum size. We'll either refill the missing amount - * or return if we already have enough space. This will also handle the resreve + * or return if we already have enough space. This will also handle the reserve * tracepoint for the reserved amount. */ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, @@ -5674,6 +5816,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, return ret; } +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = delayed_rsv; + + if (target->full || target == block_rsv) + target = global_rsv; + + if (block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} + /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. 
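__btrfs_block_rsv_release() above decides where freed metadata reservation should land: top up the delayed refs rsv first, fall back to the global reserve when that rsv is already full or is itself being released, and never move bytes across space_infos. A compact model of that choice:

#include <stdbool.h>
#include <stddef.h>

/* Simplified stand-ins, just enough to express the target selection. */
struct rsv_ref {
	bool full;
	const void *space_info;
};

const struct rsv_ref *pick_release_target(const struct rsv_ref *block_rsv,
					  const struct rsv_ref *delayed_refs_rsv,
					  const struct rsv_ref *global_rsv)
{
	const struct rsv_ref *target = delayed_refs_rsv;

	if (target->full || target == block_rsv)
		target = global_rsv;
	if (block_rsv->space_info != target->space_info)
		return NULL;	/* keep the bytes in block_rsv's space_info */
	return target;
}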
@@ -5688,7 +5855,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; u64 qgroup_to_release = 0; @@ -5698,8 +5864,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, - &qgroup_to_release); + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); @@ -5710,16 +5876,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) qgroup_to_release); } -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. + */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 released = 0; - if (global_rsv == block_rsv || - block_rsv->space_info != global_rsv->space_info) - global_rsv = NULL; - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, + num_bytes, NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5750,14 +5926,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = min(num_bytes, block_rsv->size - block_rsv->reserved); block_rsv->reserved += num_bytes; - sinfo->bytes_may_use += num_bytes; + update_bytes_may_use(sinfo, num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 1); } } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_may_use -= num_bytes; + update_bytes_may_use(sinfo, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; @@ -5784,9 +5960,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; - fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; - fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; if (fs_info->quota_root) @@ -5806,8 +5983,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 
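The delayed refs rsv is sized in "items", each costed like any other metadata reservation via btrfs_calc_trans_metadata_size(): worst-case CoW of one full path, twice, per item. A small illustration of that conversion; the 16KiB node size is only the common default, not a value read from a real filesystem:

#include <stdint.h>
#include <stdio.h>

static uint64_t calc_trans_metadata_size_model(uint64_t nodesize,
					       unsigned int num_items)
{
	const unsigned int max_level = 8;	/* BTRFS_MAX_LEVEL */

	return nodesize * 2 * max_level * num_items;
}

int main(void)
{
	/* Releasing three ref head updates on a 16KiB-node filesystem: */
	printf("%llu bytes\n",
	       (unsigned long long)calc_trans_metadata_size_model(16384, 3));
	return 0;
}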
WARN_ON(fs_info->chunk_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); } +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} /* * To be called after all the new block groups attached to the transaction @@ -6100,6 +6303,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; int factor; + int ret = 0; /* block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -6113,8 +6317,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, while (total) { cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) - return -ENOENT; + if (!cache) { + ret = -ENOENT; + break; + } factor = btrfs_bg_type_to_factor(cache->flags); /* @@ -6151,7 +6357,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6173,6 +6379,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); trans->transaction->num_dirty_bgs++; + trans->delayed_ref_updates++; btrfs_get_block_group(cache); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -6190,7 +6397,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, total -= num_bytes; bytenr += num_bytes; } - return 0; + + /* Modified block groups are accounted for in the delayed_refs_rsv. 
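btrfs_update_delayed_refs_rsv() above folds whatever trans->delayed_ref_updates has accumulated into additional rsv size and resets the counter, which is why update_block_group() bumps that counter when it dirties a block group and then calls the helper just below. A minimal model of that fold (names are simplified stand-ins):

#include <stdint.h>

struct trans_model {
	unsigned int delayed_ref_updates;
};

struct rsv_growth {
	uint64_t size;
	int full;
};

void fold_delayed_ref_updates(struct trans_model *trans,
			      struct rsv_growth *delayed_refs_rsv,
			      uint64_t bytes_per_item)
{
	if (!trans->delayed_ref_updates)
		return;
	delayed_refs_rsv->size +=
		(uint64_t)trans->delayed_ref_updates * bytes_per_item;
	delayed_refs_rsv->full = 0;
	trans->delayed_ref_updates = 0;
}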
*/ + btrfs_update_delayed_refs_rsv(trans); + return ret; } static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) @@ -6222,7 +6432,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6431,7 +6641,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - space_info->bytes_may_use -= ram_bytes; + update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -6587,7 +6797,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - space_info->bytes_pinned -= len; + update_bytes_pinned(space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -6608,7 +6818,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - space_info->bytes_may_use += to_add; + update_bytes_may_use(space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; trace_btrfs_space_reservation(fs_info, @@ -6647,9 +6857,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) unpin = &fs_info->freed_extents[0]; while (!trans->aborted) { + struct extent_state *cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY, &cached_state); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; @@ -6659,9 +6871,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end); + clear_extent_dirty(unpin, start, end, &cached_state); unpin_extent_range(fs_info, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); + free_extent_state(cached_state); cond_resched(); } @@ -6955,12 +7168,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; - if (head->extent_op) { - if (!head->must_insert_reserved) - goto out; - btrfs_free_delayed_extent_op(head->extent_op); - head->extent_op = NULL; - } + if (cleanup_extent_op(head) != NULL) + goto out; /* * waiting for the lock here would deadlock. If someone else has it @@ -6969,22 +7178,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - /* - * at this point we have a head with no other entries. Go - * ahead and process it. - */ - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); - - /* - * we don't take a ref on the node because we're removing it from the - * tree, so we just steal the ref the tree was holding. 
- */ - delayed_refs->num_heads--; - if (head->processing == 0) - delayed_refs->num_heads_ready--; + btrfs_delete_ref_head(delayed_refs, head); head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -6992,6 +7188,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; + cleanup_ref_head_accounting(trans, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -7239,6 +7436,345 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, } /* + * Structure used internally for find_free_extent() function. Wraps needed + * parameters. + */ +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; +}; + + +/* + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need fallback to unclustered mode. + * Return -EAGAIN to inform caller that we need to re-search this block group + * Return >0 to inform caller that we find nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group_cache **cluster_bg_ret) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_block_group_cache *cluster_bg; + u64 aligned_cluster; + u64 offset; + int ret; + + cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); + if (!cluster_bg) + goto refill_cluster; + if (cluster_bg != bg && (cluster_bg->ro || + !block_group_bits(cluster_bg, ffe_ctl->flags))) + goto release_cluster; + + offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, + ffe_ctl->num_bytes, cluster_bg->key.objectid, + &ffe_ctl->max_extent_size); + if (offset) { + /* We have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(cluster_bg, + ffe_ctl->search_start, ffe_ctl->num_bytes); + *cluster_bg_ret = cluster_bg; + ffe_ctl->found_offset = offset; + return 0; + } + WARN_ON(last_ptr->block_group != cluster_bg); + +release_cluster: + /* + * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so + * lets just skip it and let the allocator find whatever block it can + * find. If we reach this point, we will have tried the cluster + * allocator plenty of times and not have found anything, so we are + * likely way too fragmented for the clustering stuff to find anything. 
+ * + * However, if the cluster is taken from the current block group, + * release the cluster first, so that we stand a better chance of + * succeeding in the unclustered allocation. + */ + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { + spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + return -ENOENT; + } + + /* This cluster didn't work out, free it and start over */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + if (cluster_bg != bg) + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + +refill_cluster: + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + return -ENOENT; + } + + aligned_cluster = max_t(u64, + ffe_ctl->empty_cluster + ffe_ctl->empty_size, + bg->full_stripe_len); + ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, + ffe_ctl->search_start, ffe_ctl->num_bytes, + aligned_cluster); + if (ret == 0) { + /* Now pull our allocation out of this cluster */ + offset = btrfs_alloc_from_cluster(bg, last_ptr, + ffe_ctl->num_bytes, ffe_ctl->search_start, + &ffe_ctl->max_extent_size); + if (offset) { + /* We found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(bg, + ffe_ctl->search_start, + ffe_ctl->num_bytes); + ffe_ctl->found_offset = offset; + return 0; + } + } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_clustered) { + spin_unlock(&last_ptr->refill_lock); + + ffe_ctl->retry_clustered = true; + wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + ffe_ctl->empty_size); + return -EAGAIN; + } + /* + * At this point we either didn't find a cluster or we weren't able to + * allocate a block from our cluster. Free the cluster we've been + * trying to use, and go to the next block group. + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + return 1; +} + +/* + * Return >0 to inform caller that we find nothing + * Return 0 when we found an free extent and set ffe_ctrl->found_offset + * Return -EAGAIN to inform caller that we need to re-search this block group + */ +static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl) +{ + u64 offset; + + /* + * We are doing an unclustered allocation, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get more + * space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } + if (ffe_ctl->cached) { + struct btrfs_free_space_ctl *free_space_ctl; + + free_space_ctl = bg->free_space_ctl; + spin_lock(&free_space_ctl->tree_lock); + if (free_space_ctl->free_space < + ffe_ctl->num_bytes + ffe_ctl->empty_cluster + + ffe_ctl->empty_size) { + ffe_ctl->total_free_space = max_t(u64, + ffe_ctl->total_free_space, + free_space_ctl->free_space); + spin_unlock(&free_space_ctl->tree_lock); + return 1; + } + spin_unlock(&free_space_ctl->tree_lock); + } + + offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, + ffe_ctl->num_bytes, ffe_ctl->empty_size, + &ffe_ctl->max_extent_size); + + /* + * If we didn't find a chunk, and we haven't failed on this block group + * before, and this block group is in the middle of caching and we are + * ok with waiting, then go ahead and wait for progress to be made, and + * set @retry_unclustered to true. 
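The return convention spelled out for the clustered helper above (0, -EAGAIN, -ENOENT, >0) is what the main find_free_extent() loop switches on. A self-contained illustration of how a caller interprets those values, with the block group bookkeeping and goto labels reduced to strings:

#include <errno.h>
#include <stdio.h>

static const char *dispatch_cluster_result(int ret)
{
	switch (ret) {
	case 0:
		return "found an offset, go run the final checks";
	case -EAGAIN:
		return "re-search this block group";
	case -ENOENT:
		return "fall back to unclustered allocation in this block group";
	default:
		return ret > 0 ? "nothing usable here, move to the next block group"
			       : "hard error";
	}
}

int main(void)
{
	printf("%s\n", dispatch_cluster_result(-EAGAIN));
	return 0;
}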
+ * + * If @retry_unclustered is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && + ffe_ctl->loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_size); + ffe_ctl->retry_unclustered = true; + return -EAGAIN; + } else if (!offset) { + return 1; + } + ffe_ctl->found_offset = offset; + return 0; +} + +/* + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. + * Return <0 means we failed to locate any free extent. + */ +static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + struct btrfs_free_cluster *last_ptr, + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl, + int full_search, bool use_cluster) +{ + struct btrfs_root *root = fs_info->extent_root; + int ret; + + if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && + ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) + ffe_ctl->orig_have_caching_bg = true; + + if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && + ffe_ctl->have_caching_bg) + return 1; + + if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) + return 1; + + if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } + return 0; + } + + /* + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { + ffe_ctl->index = 0; + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any uncached bgs and we've already done a + * full search through. + */ + if (ffe_ctl->orig_have_caching_bg || !full_search) + ffe_ctl->loop = LOOP_CACHING_WAIT; + else + ffe_ctl->loop = LOOP_ALLOC_CHUNK; + } else { + ffe_ctl->loop++; + } + + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + return ret; + } + + ret = do_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + + /* Do not bail out on ENOSPC since we can do more. */ + if (ret < 0 && ret != -ENOSPC) + btrfs_abort_transaction(trans, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans); + if (ret) + return ret; + } + + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (ffe_ctl->empty_size == 0 && + ffe_ctl->empty_cluster == 0) + return -ENOSPC; + ffe_ctl->empty_size = 0; + ffe_ctl->empty_cluster = 0; + } + return 1; + } + return -ENOSPC; +} + +/* * walks the btree of allocated extents and find a hole of a given size. 
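find_free_extent_update_loop() above escalates through the LOOP_* stages when nothing was found. A small state-machine sketch of that progression, with chunk allocation and transaction handling left out (the enum names are local stand-ins for the kernel constants):

enum ffe_loop_model {
	LOOP_CACHING_NOWAIT_M,
	LOOP_CACHING_WAIT_M,
	LOOP_ALLOC_CHUNK_M,
	LOOP_NO_EMPTY_SIZE_M,
	LOOP_GIVE_UP_M,		/* -ENOSPC in the real code */
};

enum ffe_loop_model next_loop_stage(enum ffe_loop_model cur,
				    int have_uncached_bg, int full_search)
{
	switch (cur) {
	case LOOP_CACHING_NOWAIT_M:
		/* Skip the wait stage only when nothing is still caching
		 * and a full search has already been done. */
		return (have_uncached_bg || !full_search) ?
			LOOP_CACHING_WAIT_M : LOOP_ALLOC_CHUNK_M;
	case LOOP_CACHING_WAIT_M:
		return LOOP_ALLOC_CHUNK_M;
	case LOOP_ALLOC_CHUNK_M:
		return LOOP_NO_EMPTY_SIZE_M;
	default:
		return LOOP_GIVE_UP_M;
	}
}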
* The key ins is changed to record the hole: * ins->objectid == start position @@ -7248,6 +7784,20 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * * If there is no suitable free space, we will record the max size of * the free space extent currently. + * + * The overall logic and call chain: + * + * find_free_extent() + * |- Iterate through all block groups + * | |- Get a valid block group + * | |- Try to do clustered allocation in that block group + * | |- Try to do unclustered allocation in that block group + * | |- Check if the result is valid + * | | |- If valid, then exit + * | |- Jump to next block group + * | + * |- Push harder to find free extents + * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, @@ -7255,24 +7805,28 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; - struct btrfs_root *root = fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - u64 search_start = 0; - u64 max_extent_size = 0; - u64 max_free_space = 0; - u64 empty_cluster = 0; + struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - int loop = 0; - int index = btrfs_bg_flags_to_raid_index(flags); - bool failed_cluster_refill = false; - bool failed_alloc = false; bool use_cluster = true; - bool have_caching_bg = false; - bool orig_have_caching_bg = false; bool full_search = false; WARN_ON(num_bytes < fs_info->sectorsize); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.search_start = 0; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.delalloc = delalloc; + ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); + ffe_ctl.have_caching_bg = false; + ffe_ctl.orig_have_caching_bg = false; + ffe_ctl.found_offset = 0; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -7308,7 +7862,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } - last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); + last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl.empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) @@ -7325,10 +7880,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&last_ptr->lock); } - search_start = max(search_start, first_logical_byte(fs_info, 0)); - search_start = max(search_start, hint_byte); - if (search_start == hint_byte) { - block_group = btrfs_lookup_block_group(fs_info, search_start); + ffe_ctl.search_start = max(ffe_ctl.search_start, + first_logical_byte(fs_info, 0)); + ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); + if (ffe_ctl.search_start == hint_byte) { + block_group = btrfs_lookup_block_group(fs_info, + ffe_ctl.search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. 
@@ -7350,7 +7907,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - index = btrfs_bg_flags_to_raid_index( + ffe_ctl.index = btrfs_bg_flags_to_raid_index( block_group->flags); btrfs_lock_block_group(block_group, delalloc); goto have_block_group; @@ -7360,21 +7917,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, } } search: - have_caching_bg = false; - if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags)) + ffe_ctl.have_caching_bg = false; + if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || + ffe_ctl.index == 0) full_search = true; down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups[index], - list) { - u64 offset; - int cached; - + list_for_each_entry(block_group, + &space_info->block_groups[ffe_ctl.index], list) { /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) continue; btrfs_grab_block_group(block_group, delalloc); - search_start = block_group->key.objectid; + ffe_ctl.search_start = block_group->key.objectid; /* * this can happen if we end up cycling through all the @@ -7398,9 +7953,9 @@ search: } have_block_group: - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { - have_caching_bg = true; + ffe_ctl.cached = block_group_cache_done(block_group); + if (unlikely(!ffe_ctl.cached)) { + ffe_ctl.have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -7414,322 +7969,92 @@ have_block_group: * lets look there */ if (last_ptr && use_cluster) { - struct btrfs_block_group_cache *used_block_group; - unsigned long aligned_cluster; - /* - * the refill lock keeps out other - * people trying to start a new cluster - */ - used_block_group = btrfs_lock_cluster(block_group, - last_ptr, - delalloc); - if (!used_block_group) - goto refill_cluster; - - if (used_block_group != block_group && - (used_block_group->ro || - !block_group_bits(used_block_group, flags))) - goto release_cluster; - - offset = btrfs_alloc_from_cluster(used_block_group, - last_ptr, - num_bytes, - used_block_group->key.objectid, - &max_extent_size); - if (offset) { - /* we have a block, we're done */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster( - used_block_group, - search_start, num_bytes); - if (used_block_group != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = used_block_group; - } - goto checks; - } - - WARN_ON(last_ptr->block_group != used_block_group); -release_cluster: - /* If we are on LOOP_NO_EMPTY_SIZE, we can't - * set up a new clusters, so lets just skip it - * and let the allocator find whatever block - * it can find. If we reach this point, we - * will have tried the cluster allocator - * plenty of times and not have found - * anything, so we are likely way too - * fragmented for the clustering stuff to find - * anything. - * - * However, if the cluster is taken from the - * current block group, release the cluster - * first, so that we stand a better chance of - * succeeding in the unclustered - * allocation. 
*/ - if (loop >= LOOP_NO_EMPTY_SIZE && - used_block_group != block_group) { - spin_unlock(&last_ptr->refill_lock); - btrfs_release_block_group(used_block_group, - delalloc); - goto unclustered_alloc; - } + struct btrfs_block_group_cache *cluster_bg = NULL; - /* - * this cluster didn't work out, free it and - * start over - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - - if (used_block_group != block_group) - btrfs_release_block_group(used_block_group, - delalloc); -refill_cluster: - if (loop >= LOOP_NO_EMPTY_SIZE) { - spin_unlock(&last_ptr->refill_lock); - goto unclustered_alloc; - } - - aligned_cluster = max_t(unsigned long, - empty_cluster + empty_size, - block_group->full_stripe_len); + ret = find_free_extent_clustered(block_group, last_ptr, + &ffe_ctl, &cluster_bg); - /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(fs_info, block_group, - last_ptr, search_start, - num_bytes, - aligned_cluster); if (ret == 0) { - /* - * now pull our allocation out of this - * cluster - */ - offset = btrfs_alloc_from_cluster(block_group, - last_ptr, - num_bytes, - search_start, - &max_extent_size); - if (offset) { - /* we found one, proceed */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster( - block_group, search_start, - num_bytes); - goto checks; + if (cluster_bg && cluster_bg != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = cluster_bg; } - } else if (!cached && loop > LOOP_CACHING_NOWAIT - && !failed_cluster_refill) { - spin_unlock(&last_ptr->refill_lock); - - failed_cluster_refill = true; - wait_block_group_cache_progress(block_group, - num_bytes + empty_cluster + empty_size); + goto checks; + } else if (ret == -EAGAIN) { goto have_block_group; - } - - /* - * at this point we either didn't find a cluster - * or we weren't able to allocate a block from our - * cluster. Free the cluster we've been trying - * to use, and go to the next block group - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } - -unclustered_alloc: - /* - * We are doing an unclustered alloc, set the fragmented flag so - * we don't bother trying to setup a cluster again until we get - * more space. - */ - if (unlikely(last_ptr)) { - spin_lock(&last_ptr->lock); - last_ptr->fragmented = 1; - spin_unlock(&last_ptr->lock); - } - if (cached) { - struct btrfs_free_space_ctl *ctl = - block_group->free_space_ctl; - - spin_lock(&ctl->tree_lock); - if (ctl->free_space < - num_bytes + empty_cluster + empty_size) { - max_free_space = max(max_free_space, - ctl->free_space); - spin_unlock(&ctl->tree_lock); + } else if (ret > 0) { goto loop; } - spin_unlock(&ctl->tree_lock); + /* ret == -ENOENT case falls through */ } - offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size, - &max_extent_size); - /* - * If we didn't find a chunk, and we haven't failed on this - * block group before, and this block group is in the middle of - * caching and we are ok with waiting, then go ahead and wait - * for progress to be made, and set failed_alloc to true. - * - * If failed_alloc is true then we've already waited on this - * block group once and should move on to the next block group. 
- */ - if (!offset && !failed_alloc && !cached && - loop > LOOP_CACHING_NOWAIT) { - wait_block_group_cache_progress(block_group, - num_bytes + empty_size); - failed_alloc = true; + ret = find_free_extent_unclustered(block_group, last_ptr, + &ffe_ctl); + if (ret == -EAGAIN) goto have_block_group; - } else if (!offset) { + else if (ret > 0) goto loop; - } + /* ret == 0 case falls through */ checks: - search_start = round_up(offset, fs_info->stripesize); + ffe_ctl.search_start = round_up(ffe_ctl.found_offset, + fs_info->stripesize); /* move on to the next group */ - if (search_start + num_bytes > + if (ffe_ctl.search_start + num_bytes > block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); goto loop; } - if (offset < search_start) - btrfs_add_free_space(block_group, offset, - search_start - offset); + if (ffe_ctl.found_offset < ffe_ctl.search_start) + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ - ins->objectid = search_start; + ins->objectid = ffe_ctl.search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(block_group, search_start, num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, + num_bytes); btrfs_release_block_group(block_group, delalloc); break; loop: - failed_cluster_refill = false; - failed_alloc = false; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - index); + ffe_ctl.index); btrfs_release_block_group(block_group, delalloc); cond_resched(); } up_read(&space_info->groups_sem); - if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg - && !orig_have_caching_bg) - orig_have_caching_bg = true; - - if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) - goto search; - - if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, + full_search, use_cluster); + if (ret > 0) goto search; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { - index = 0; - if (loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. 
- */ - if (orig_have_caching_bg || !full_search) - loop = LOOP_CACHING_WAIT; - else - loop = LOOP_ALLOC_CHUNK; - } else { - loop++; - } - - if (loop == LOOP_ALLOC_CHUNK) { - struct btrfs_trans_handle *trans; - int exist = 0; - - trans = current->journal_info; - if (trans) - exist = 1; - else - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE); - - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - loop = LOOP_NO_EMPTY_SIZE; - - /* - * Do not bail out on ENOSPC since we - * can do more things. - */ - if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, ret); - else - ret = 0; - if (!exist) - btrfs_end_transaction(trans); - if (ret) - goto out; - } - - if (loop == LOOP_NO_EMPTY_SIZE) { - /* - * Don't loop again if we already have no empty_size and - * no empty_cluster. - */ - if (empty_size == 0 && - empty_cluster == 0) { - ret = -ENOSPC; - goto out; - } - empty_size = 0; - empty_cluster = 0; - } - - goto search; - } else if (!ins->objectid) { - ret = -ENOSPC; - } else if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } - ret = 0; - } -out: if (ret == -ENOSPC) { - if (!max_extent_size) - max_extent_size = max_free_space; + /* + * Use ffe_ctl->total_free_space as fallback if we can't find + * any contiguous hole. + */ + if (!ffe_ctl.max_extent_size) + ffe_ctl.max_extent_size = ffe_ctl.total_free_space; spin_lock(&space_info->lock); - space_info->max_extent_size = max_extent_size; + space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); - ins->offset = max_extent_size; + ins->offset = ffe_ctl.max_extent_size; } return ret; } @@ -8169,13 +8494,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(buf, owner); - write_extent_buffer_fsid(buf, fs_info->fsid); + write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different - * EXENT bit to differentiate dirty pages. + * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, @@ -8221,7 +8546,12 @@ again: goto again; } - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. 
+ */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8544,7 +8874,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; - u32 blocksize; struct btrfs_key key; struct btrfs_key first_key; struct extent_buffer *next; @@ -8569,7 +8898,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); btrfs_node_key_to_cpu(path->nodes[level], &first_key, path->slots[level]); - blocksize = fs_info->nodesize; next = find_extent_buffer(fs_info, bytenr); if (!next) { @@ -8693,7 +9021,7 @@ skip: ret); } } - ret = btrfs_free_extent(trans, root, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, parent, root->root_key.objectid, level - 1, 0); if (ret) @@ -8944,9 +9272,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_free; } + err = btrfs_run_delayed_items(trans); + if (err) + goto out_end_trans; + if (block_rsv) trans->block_rsv = block_rsv; + /* + * This will help us catch people modifying the fs tree while we're + * dropping it. It is unsafe to mess with the fs tree while it's being + * dropped as we unlock the root node and parent nodes as we walk down + * the tree, assuming nothing will change. If something does change + * then we'll have stale information and drop references to blocks we've + * already dropped. + */ + set_bit(BTRFS_ROOT_DELETING, &root->state); if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -9421,7 +9762,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) } /* - * checks to see if its even possible to relocate this block group. + * Checks to see if it's even possible to relocate this block group. * * @return - -1 if it's not a good idea to relocate this block group, 0 if its * ok to go ahead and try. @@ -10049,7 +10390,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _alot_ of + * the space in and be done with it. This saves us _a_lot_ of * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { @@ -10154,6 +10495,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) add_block_group_free_space(trans, block_group); /* already aborted the transaction if it failed. 
*/ next: + btrfs_delayed_refs_rsv_release(fs_info, 1); list_del_init(&block_group->bg_list); } btrfs_trans_release_chunk_metadata(trans); @@ -10231,6 +10573,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); set_avail_alloc_bits(fs_info, type); return 0; @@ -10268,6 +10612,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int factor; struct btrfs_caching_control *caching_ctl = NULL; bool remove_em; + bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!block_group); @@ -10315,7 +10660,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_lock(&trans->transaction->cache_write_mutex); /* - * make sure our free spache cache IO is done before remove the + * Make sure our free space cache IO is done before removing the * free space inode */ spin_lock(&trans->transaction->dirty_bgs_lock); @@ -10332,6 +10677,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (!list_empty(&block_group->dirty_list)) { list_del_init(&block_group->dirty_list); + remove_rsv = true; btrfs_put_block_group(block_group); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -10541,6 +10887,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); out: + if (remove_rsv) + btrfs_delayed_refs_rsv_release(fs_info, 1); btrfs_free_path(path); return ret; } @@ -10698,7 +11046,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - space_info->bytes_pinned -= block_group->pinned; + update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, @@ -10829,7 +11177,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, if (!blk_queue_discard(bdev_get_queue(device->bdev))) return 0; - /* Not writeable = nothing to do. */ + /* Not writable = nothing to do. 
*/ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d228f706ff3e..fc126b92ea59 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -89,9 +89,18 @@ void btrfs_leak_debug_check(void) static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - if (tree->ops && tree->ops->check_extent_io_range) - tree->ops->check_extent_io_range(tree->private_data, caller, - start, end); + struct inode *inode = tree->private_data; + u64 isize; + + if (!inode || !is_data_inode(inode)) + return; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } } #else #define btrfs_leak_debug_add(new, head) do {} while (0) @@ -344,13 +353,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, return tree_search_for_insert(tree, offset, NULL, NULL); } -static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, - struct extent_state *other) -{ - if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->private_data, new, other); -} - /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -374,7 +376,10 @@ static void merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -386,7 +391,10 @@ static void merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -395,20 +403,6 @@ static void merge_state(struct extent_io_tree *tree, } } -static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->private_data, state, bits); -} - -static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(tree->private_data, state, bits); -} - static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits, struct extent_changeset *changeset); @@ -451,13 +445,6 @@ static int insert_state(struct extent_io_tree *tree, return 0; } -static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, - u64 split) -{ - if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->private_data, orig, split); -} - /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 
'split' indicates an @@ -477,7 +464,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, { struct rb_node *node; - split_cb(tree, orig, split); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_split_delalloc_extent(tree->private_data, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -504,7 +492,7 @@ static struct extent_state *next_state(struct extent_state *state) /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1). + * it will optionally wake up anyone waiting on this state (wake == 1). * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree @@ -523,7 +511,10 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; } - clear_state_cb(tree, state, bits); + + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_clear_delalloc_extent(tree->private_data, state, bits); + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); state->state &= ~bits_to_clear; @@ -800,7 +791,9 @@ static void set_state_bits(struct extent_io_tree *tree, unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; - set_state_cb(tree, state, bits); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_set_delalloc_extent(tree->private_data, state, bits); + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; @@ -1459,16 +1452,16 @@ out: * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * true is returned if we find something, false if nothing was in the tree */ -static noinline u64 find_delalloc_range(struct extent_io_tree *tree, +static noinline bool find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; - u64 found = 0; + bool found = false; u64 total_bytes = 0; spin_lock(&tree->lock); @@ -1479,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, */ node = tree_search(tree, cur_start); if (!node) { - if (!found) - *end = (u64)-1; + *end = (u64)-1; goto out; } @@ -1500,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *cached_state = state; refcount_inc(&state->refs); } - found++; + found = true; *end = state->end; cur_start = state->end + 1; node = rb_next(node); @@ -1558,19 +1550,22 @@ static noinline int lock_delalloc_pages(struct inode *inode, } /* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, + * Find and lock a contiguous range of bytes in the file marked as delalloc, no + * more than @max_bytes. 
@Start and @end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * Return: true if we find something + * false if nothing was in the tree */ -static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, +EXPORT_FOR_TESTS +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; - u64 found; + bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1585,7 +1580,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return 0; + return false; } /* @@ -1605,6 +1600,7 @@ again: /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end); + ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching @@ -1616,11 +1612,10 @@ again: loops = 1; goto again; } else { - found = 0; + found = false; goto out_failed; } } - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); @@ -1643,17 +1638,6 @@ out_failed: return found; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) -{ - return find_lock_delalloc_range(inode, tree, locked_page, start, end, - max_bytes); -} -#endif - static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, pgoff_t start_index, pgoff_t end_index, @@ -2349,13 +2333,11 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, } /* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed + * This is a generic handler for readpage errors. If other copies exist, read + * those and write back good data to the failed position. 
Does not investigate + * in remapping the failed extent elsewhere, hoping the device will be smart + * enough to do this as needed */ - static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int failed_mirror) @@ -2412,14 +2394,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); - struct extent_io_tree *tree; int ret = 0; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, end, NULL, - uptodate); + btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2522,6 +2499,8 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + bool data_inode = btrfs_ino(BTRFS_I(inode)) + != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2551,7 +2530,7 @@ static void end_bio_extent_readpage(struct bio *bio) len = bvec->bv_len; mirror = io_bio->mirror_num; - if (likely(uptodate && tree->ops)) { + if (likely(uptodate)) { ret = tree->ops->readpage_end_io_hook(io_bio, offset, page, start, end, mirror); @@ -2567,38 +2546,37 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (tree->ops) { - ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (ret == -EAGAIN) { - /* - * Data inode's readpage_io_failed_hook() always - * returns -EAGAIN. - * - * The generic bio_readpage_error handles errors - * the following way: If possible, new read - * requests are created and submitted and will - * end up in end_bio_extent_readpage as well (if - * we're lucky, not in the !uptodate case). In - * that case it returns 0 and we just go on with - * the next page in our bio. If it can't handle - * the error it will return -EIO and we remain - * responsible for that page. - */ - ret = bio_readpage_error(bio, offset, page, - start, end, mirror); - if (ret == 0) { - uptodate = !bio->bi_status; - offset += len; - continue; - } - } + if (data_inode) { /* - * metadata's readpage_io_failed_hook() always returns - * -EIO and fixes nothing. -EIO is also returned if - * data inode error could not be fixed. + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, + * not in the !uptodate case). In that case it returns + * 0 and we just go on with the next page in our bio. + * If it can't handle the error it will return -EIO and + * we remain responsible for that page. 
*/ - ASSERT(ret == -EIO); + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); + if (ret == 0) { + uptodate = !bio->bi_status; + offset += len; + continue; + } + } else { + struct extent_buffer *eb; + + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, + &eb->bflags)) + btree_readahead_hook(eb, -EIO); + + ret = -EIO; } readpage_ok: if (likely(uptodate)) { @@ -2607,7 +2585,7 @@ readpage_ok: unsigned off; /* Zero out the end if this page straddles i_size */ - off = i_size & (PAGE_SIZE-1); + off = offset_in_page(i_size); if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); SetPageUptodate(page); @@ -2644,8 +2622,7 @@ readpage_ok: if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -2782,8 +2759,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, else contig = bio_end_sector(bio) == sector; - if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size, - bio, bio_flags)) + ASSERT(tree->ops); + if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || @@ -2911,7 +2888,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (page->index == last_byte >> PAGE_SHIFT) { char *userpage; - size_t zero_offset = last_byte & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; @@ -3205,7 +3182,7 @@ static void update_nr_written(struct writeback_control *wbc, /* * helper for __extent_writepage, doing all of the delayed allocation setup. * - * This returns 1 if our fill_delalloc function did all the work required + * This returns 1 if btrfs_run_delalloc_range function did all the work required * to write the page (copy into inline extent). In this case the IO has * been started and the page is already unlocked. 
* @@ -3213,44 +3190,37 @@ static void update_nr_written(struct writeback_control *wbc, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct inode *inode, - struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, - u64 delalloc_start, - unsigned long *nr_written) + struct page *page, struct writeback_control *wbc, + u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = epd->tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; - u64 nr_delalloc; + bool found; u64 delalloc_to_write = 0; u64 delalloc_end = 0; int ret; int page_started = 0; - if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) - return 0; while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, + found = find_lock_delalloc_range(inode, tree, page, &delalloc_start, - &delalloc_end, - BTRFS_MAX_EXTENT_SIZE); - if (nr_delalloc == 0) { + &delalloc_end); + if (!found) { delalloc_start = delalloc_end + 1; continue; } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - nr_written, wbc); + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, + delalloc_end, &page_started, nr_written, wbc); /* File system has been set read-only */ if (ret) { SetPageError(page); - /* fill_delalloc should be return < 0 for error - * but just in case, we use > 0 here meaning the - * IO is started, so we don't want to return > 0 - * unless things are going well. + /* + * btrfs_run_delalloc_range should return < 0 for error + * but just in case, we use > 0 here meaning the IO is + * started, so we don't want to return > 0 unless + * things are going well. */ ret = ret < 0 ? 
ret : -EIO; goto done; @@ -3323,20 +3293,17 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, int nr = 0; bool compressed; - if (tree->ops && tree->ops->writepage_start_hook) { - ret = tree->ops->writepage_start_hook(page, start, - page_end); - if (ret) { - /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); + ret = btrfs_writepage_cow_fixup(page, start, page_end); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); - update_nr_written(wbc, nr_written); - unlock_page(page); - return 1; - } + update_nr_written(wbc, nr_written); + unlock_page(page); + return 1; } /* @@ -3347,9 +3314,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, end = page_end; if (i_size <= start) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); goto done; } @@ -3360,9 +3325,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 offset; if (cur >= i_size) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, cur, + page_end, 1); break; } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, @@ -3396,11 +3360,10 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, * end_io notification does not happen here for * compressed extents */ - if (!compressed && tree->ops && - tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - cur + iosize - 1, - NULL, 1); + if (!compressed) + btrfs_writepage_endio_finish_ordered(page, cur, + cur + iosize - 1, + 1); else if (compressed) { /* we don't want to end_page_writeback on * a compressed extent. this happens @@ -3469,7 +3432,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ClearPageError(page); - pg_offset = i_size & (PAGE_SIZE - 1); + pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); @@ -3491,11 +3454,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); - ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); - if (ret == 1) - goto done_unlocked; - if (ret) - goto done; + if (!epd->extent_locked) { + ret = writepage_delalloc(inode, page, wbc, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + } ret = __extent_writepage_io(inode, page, wbc, epd, i_size, nr_written, write_flags, &nr); @@ -3934,12 +3899,25 @@ static int extent_write_cache_pages(struct address_space *mapping, range_whole = 1; scanned = 1; } - if (wbc->sync_mode == WB_SYNC_ALL) + + /* + * We do the tagged writepage as long as the snapshot flush bit is set + * and we are the first one who do the filemap_flush() on this inode. + * + * The nr_to_write == LONG_MAX is needed to make sure other flushers do + * not race in and drop the bit. 
+ */ + if (range_whole && wbc->nr_to_write == LONG_MAX && + test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &BTRFS_I(inode)->runtime_flags)) + wbc->tagged_writepages = 1; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && @@ -4084,10 +4062,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - start + PAGE_SIZE - 1, - NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, + start + PAGE_SIZE - 1, 1); unlock_page(page); } put_page(page); @@ -4118,42 +4094,36 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; unsigned long bio_flags = 0; struct page *pagepool[16]; - struct page *page; struct extent_map *em_cached = NULL; struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - page = list_entry(pages->prev, struct page, lru); + while (!list_empty(pages)) { + for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { + struct page *page = list_entry(pages->prev, + struct page, lru); - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - continue; + prefetchw(&page->flags); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + readahead_gfp_mask(mapping))) { + put_page(page); + continue; + } + + pagepool[nr++] = page; } - pagepool[nr++] = page; - if (nr < ARRAY_SIZE(pagepool)) - continue; __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); - nr = 0; + &bio_flags, &prev_em_start); } - if (nr) - __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); - BUG_ON(!list_empty(pages)); if (bio) return submit_one_bio(bio, 0, bio_flags); return 0; @@ -4342,7 +4312,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, /* * Sanity check, extent_fiemap() should have ensured that new - * fiemap extent won't overlap with cahced one. + * fiemap extent won't overlap with cached one. * Not recoverable. * * NOTE: Physical address can overlap, due to compression @@ -4914,13 +4884,6 @@ again: check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); - /* - * We will free dummy extent buffer's if they come into - * free_extent_buffer with a ref count of 2, but if we are using this we - * want the buffers to stay in memory until we're done with them, so - * bump the ref count again. 
- */ - atomic_inc(&eb->refs); return eb; free_eb: btrfs_release_extent_buffer(eb); @@ -5102,7 +5065,9 @@ void free_extent_buffer(struct extent_buffer *eb) while (1) { refs = atomic_read(&eb->refs); - if (refs <= 3) + if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) + || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && + refs == 1)) break; old = atomic_cmpxchg(&eb->refs, refs, refs - 1); if (old == refs) @@ -5111,10 +5076,6 @@ void free_extent_buffer(struct extent_buffer *eb) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && - test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) - atomic_dec(&eb->refs); - - if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -5340,7 +5301,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; if (start + len > eb->len) { @@ -5350,7 +5311,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, return; } - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5375,14 +5336,14 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5413,10 +5374,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb, char **map, unsigned long *map_start, unsigned long *map_len) { - size_t offset = start & (PAGE_SIZE - 1); + size_t offset; char *kaddr; struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; unsigned long end_i = (start_offset + start + min_len - 1) >> PAGE_SHIFT; @@ -5453,14 +5414,14 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5509,13 +5470,13 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5539,13 +5500,13 @@ void memzero_extent_buffer(struct 
extent_buffer *eb, unsigned long start, size_t offset; struct page *page; char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5584,13 +5545,12 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, size_t offset; struct page *page; char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; WARN_ON(src->len != dst_len); - offset = (start_offset + dst_offset) & - (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5626,7 +5586,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, unsigned long *page_index, size_t *page_offset) { - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -5638,7 +5598,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, offset = start_offset + start + byte_offset; *page_index = offset >> PAGE_SHIFT; - *page_offset = offset & (PAGE_SIZE - 1); + *page_offset = offset_in_page(offset); } /** @@ -5780,7 +5740,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t cur; size_t dst_off_in_page; size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -5798,10 +5758,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } while (len > 0) { - dst_off_in_page = (start_offset + dst_offset) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_offset) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(start_offset + dst_offset); + src_off_in_page = offset_in_page(start_offset + src_offset); dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_SHIFT; @@ -5829,7 +5787,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -5853,10 +5811,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, dst_i = (start_offset + dst_end) >> PAGE_SHIFT; src_i = (start_offset + src_end) >> PAGE_SHIFT; - dst_off_in_page = (start_offset + dst_end) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_end) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(start_offset + dst_end); + src_off_in_page = offset_in_page(start_offset + src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 369daa5d4f73..9673be3f3d1f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -37,18 +37,22 @@ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_FLAG_SHIFT 16 -/* these are bit numbers for test/set bit */ -#define EXTENT_BUFFER_UPTODATE 0 
-#define EXTENT_BUFFER_DIRTY 2 -#define EXTENT_BUFFER_CORRUPT 3 -#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ -#define EXTENT_BUFFER_TREE_REF 5 -#define EXTENT_BUFFER_STALE 6 -#define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ -#define EXTENT_BUFFER_UNMAPPED 9 -#define EXTENT_BUFFER_IN_TREE 10 -#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ +enum { + EXTENT_BUFFER_UPTODATE, + EXTENT_BUFFER_DIRTY, + EXTENT_BUFFER_CORRUPT, + /* this got triggered by readahead */ + EXTENT_BUFFER_READAHEAD, + EXTENT_BUFFER_TREE_REF, + EXTENT_BUFFER_STALE, + EXTENT_BUFFER_WRITEBACK, + /* read IO error */ + EXTENT_BUFFER_READ_ERR, + EXTENT_BUFFER_UNMAPPED, + EXTENT_BUFFER_IN_TREE, + /* write IO error */ + EXTENT_BUFFER_WRITE_ERR, +}; /* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) @@ -94,38 +98,13 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct extent_io_ops { /* - * The following callbacks must be allways defined, the function + * The following callbacks must be always defined, the function * pointer will be called unconditionally. */ extent_submit_bio_hook_t *submit_bio_hook; int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); - int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - - /* - * Optional hooks, called if the pointer is not NULL - */ - int (*fill_delalloc)(void *private_data, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc); - - int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate); - void (*set_bit_hook)(void *private_data, struct extent_state *state, - unsigned *bits); - void (*clear_bit_hook)(void *private_data, - struct extent_state *state, - unsigned *bits); - void (*merge_extent_hook)(void *private_data, - struct extent_state *new, - struct extent_state *other); - void (*split_extent_hook)(void *private_data, - struct extent_state *orig, u64 split); - void (*check_extent_io_range)(void *private_data, const char *caller, - u64 start, u64 end); }; struct extent_io_tree { @@ -353,11 +332,11 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end) + u64 end, struct extent_state **cached) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL); + EXTENT_DO_ACCOUNTING, 0, 0, cached); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -546,10 +525,9 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes); +bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7eea8b6e2cd3..a042a193c120 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -475,7 +475,8 @@ static struct 
extent_map *prev_extent_map(struct extent_map *em) return container_of(prev, struct extent_map, rb_node); } -/* helper for btfs_get_extent. Given an existing extent in the tree, +/* + * Helper for btrfs_get_extent. Given an existing extent in the tree, * the existing extent is the nearest extent to map_start, * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 31977ffd6190..ef05a0121652 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -11,13 +11,20 @@ #define EXTENT_MAP_INLINE ((u64)-2) #define EXTENT_MAP_DELALLOC ((u64)-1) -/* bits for the flags field */ -#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ -#define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ -#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ -#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ -#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ +/* bits for the extent_map::flags field */ +enum { + /* this entry not yet on disk, don't free it */ + EXTENT_FLAG_PINNED, + EXTENT_FLAG_COMPRESSED, + /* pre-allocated extent */ + EXTENT_FLAG_PREALLOC, + /* Logging this extent */ + EXTENT_FLAG_LOGGING, + /* Filling in a preallocated extent */ + EXTENT_FLAG_FILLING, + /* filesystem extent mapping type */ + EXTENT_FLAG_FS_MAPPING, +}; struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ba74827beb32..920bf3b4b0ef 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -142,11 +142,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) -{ - kfree(bio->csum_allocated); -} - static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { @@ -175,14 +170,12 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum_allocated = kmalloc_array(nblocks, - csum_size, GFP_NOFS); - if (!btrfs_bio->csum_allocated) { + btrfs_bio->csum = kmalloc_array(nblocks, csum_size, + GFP_NOFS); + if (!btrfs_bio->csum) { btrfs_free_path(path); return BLK_STS_RESOURCE; } - btrfs_bio->csum = btrfs_bio->csum_allocated; - btrfs_bio->end_io = btrfs_io_bio_endio_readpage; } else { btrfs_bio->csum = btrfs_bio->csum_inline; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a3c22e16509b..d38dc8c31533 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -399,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, size_t copied = 0; size_t total_copied = 0; int pg = 0; - int offset = pos & (PAGE_SIZE - 1); + int offset = offset_in_page(pos); while (write_bytes > 0) { size_t count = min_t(size_t, @@ -1611,7 +1611,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { - size_t offset = pos & (PAGE_SIZE - 1); + size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_SIZE - @@ -2005,7 +2005,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by settattr when we are about to truncate + * 
ordered_data_close is set by setattr when we are about to truncate * a file from a non-zero size to a zero size. This tries to * flush down new bytes that may have been written if the * application were using truncate to replace a file in place. @@ -2089,8 +2089,32 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* + * Before we acquired the inode's lock, someone may have dirtied more + * pages in the target range. We need to make sure that writeback for + * any such pages does not start while we are logging the inode, because + * if it does, any of the following might happen when we are not doing a + * full inode sync: + * + * 1) We log an extent after its writeback finishes but before its + * checksums are added to the csum tree, leading to -EIO errors + * when attempting to read the extent after a log replay. + * + * 2) We can end up logging an extent before its writeback finishes. + * Therefore after the log replay we will have a file extent item + * pointing to an unwritten extent (and no data checksums as well). + * + * So trigger writeback for any eventual new dirty pages and then we + * wait for all ordered extents to complete below. + */ + ret = start_ordered_ops(inode, start, end); + if (ret) { + inode_unlock(inode); + goto out; + } + + /* * We have to do this here to avoid the priority inversion of waiting on - * IO of a lower priority task while holding a transaciton open. + * IO of a lower priority task while holding a transaction open. */ ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { @@ -2130,7 +2154,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * here we could get into a situation where we're waiting on IO to * happen that is blocked on a transaction trying to commit. With start * we inc the extwriter counter, so we wait for all extwriters to exit - * before we start blocking join'ers. This comment is to keep somebody + * before we start blocking joiners. This comment is to keep somebody * from thinking they are super smart and changing this to * btrfs_join_transaction *cough*Josef*cough*. */ @@ -2162,25 +2186,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); - /* - * If any of the ordered extents had an error, just return it to user - * space, so that the application knows some writes didn't succeed and - * can take proper action (retry for e.g.). Blindly committing the - * transaction in this case, would fool userspace that everything was - * successful. And we also want to make sure our log doesn't contain - * file extent items pointing to extents that weren't fully written to - - * just like in the non fast fsync path, where we check for the ordered - * operation's error flag before writing to the log tree and return -EIO - * if any of them had this flag set (btrfs_wait_ordered_range) - - * therefore we need to check for errors in the ordered operations, - * which are indicated by ctx.io_err. 
- */ - if (ctx.io_err) { - btrfs_end_transaction(trans); - ret = ctx.io_err; - goto out; - } - if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index d6736595ec57..e5089087eaa6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -74,11 +74,11 @@ out: return ret; } -struct btrfs_free_space_info * -search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, - struct btrfs_path *path, int cow) +EXPORT_FOR_TESTS +struct btrfs_free_space_info *search_free_space_info( + struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow) { struct btrfs_root *root = fs_info->free_space_root; struct btrfs_key key; @@ -176,6 +176,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len) } } +EXPORT_FOR_TESTS int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -315,6 +316,7 @@ out: return ret; } +EXPORT_FOR_TESTS int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -487,6 +489,7 @@ out: return ret; } +EXPORT_FOR_TESTS int free_space_test_bit(struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 offset) { @@ -775,6 +778,7 @@ out: return ret; } +EXPORT_FOR_TESTS int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) @@ -968,6 +972,7 @@ out: return ret; } +EXPORT_FOR_TESTS int __add_to_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9ea4c6f0352f..43eb4535319d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -27,6 +27,7 @@ #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> +#include <linux/swap.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -103,23 +104,23 @@ static void __endio_write_update_ordered(struct inode *inode, /* * Cleanup all submitted ordered extents in specified range to handle errors - * from the fill_dellaloc() callback. + * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered - * extent (btrfs_finish_ordered_io()). Also note that the caller of the - * fill_delalloc() callback already does proper cleanup for the first page of - * the range, that is, it invokes the callback writepage_end_io_hook() for the - * range of the first page. + * extent (btrfs_finish_ordered_io()). 
*/ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, - const u64 offset, - const u64 bytes) + struct page *locked_page, + u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + u64 page_start = page_offset(locked_page); + u64 page_end = page_start + PAGE_SIZE - 1; + struct page *page; while (index <= end_index) { @@ -130,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, ClearPagePrivate2(page); put_page(page); } - return __endio_write_update_ordered(inode, offset + PAGE_SIZE, - bytes - PAGE_SIZE, false); + + /* + * In case this page belongs to the delalloc range being instantiated + * then skip it, since the first page of a range is going to be + * properly cleaned up by the caller of run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + offset += PAGE_SIZE; + bytes -= PAGE_SIZE; + } + + return __endio_write_update_ordered(inode, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -229,7 +240,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_SIZE - 1); + offset = offset_in_page(start); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); put_page(page); @@ -357,7 +368,7 @@ struct async_extent { struct async_cow { struct inode *inode; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct page *locked_page; u64 start; u64 end; @@ -538,8 +549,7 @@ again: &total_compressed); if (!ret) { - unsigned long offset = total_compressed & - (PAGE_SIZE - 1); + unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; char *kaddr; @@ -847,14 +857,13 @@ retry: ins.offset, async_extent->pages, async_extent->nr_pages, async_cow->write_flags)) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->i_mapping; - tree->ops->writepage_end_io_hook(p, start, end, - NULL, 0); + btrfs_writepage_endio_finish_ordered(p, start, end, 0); + p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, end, NULL, 0, @@ -1144,13 +1153,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) { struct btrfs_fs_info *fs_info; struct async_cow *async_cow; - struct btrfs_root *root; unsigned long nr_pages; async_cow = container_of(work, struct async_cow, work); - root = async_cow->root; - fs_info = root->fs_info; + fs_info = async_cow->fs_info; nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> PAGE_SHIFT; @@ -1179,7 +1186,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; - struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; @@ -1189,7 +1195,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ async_cow->inode = igrab(inode); - async_cow->root = root; + async_cow->fs_info = fs_info; async_cow->locked_page = locked_page; async_cow->start = start; async_cow->write_flags = write_flags; @@ -1372,7 +1378,8 @@ next_slot: * Do the same check as in btrfs_cross_ref_exist but * without 
the unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= + if (!nolock && + btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) @@ -1576,12 +1583,12 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) } /* - * extent_io.c call back to do delayed allocation processing + * Function to process delayed allocation (create CoW) for ranges which are + * being touched for the first time. */ -static int run_delalloc_range(void *private_data, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = private_data; int ret; @@ -1605,14 +1612,14 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, write_flags); } if (ret) - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, locked_page, start, + end - start + 1); return ret; } -static void btrfs_split_extent_hook(void *private_data, - struct extent_state *orig, u64 split) +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split) { - struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1625,7 +1632,7 @@ static void btrfs_split_extent_hook(void *private_data, u64 new_size; /* - * See the explanation in btrfs_merge_extent_hook, the same + * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; @@ -1642,16 +1649,13 @@ static void btrfs_split_extent_hook(void *private_data, } /* - * extent_io.c merge_extent_hook, used to track merged delayed allocation - * extents so we can keep track of new extents that are just merged onto old - * extents, such as when we are doing sequential writes, so we can properly - * account for the metadata space we'll need. + * Handle merged delayed allocation extents so we can keep track of new extents + * that are just merged onto old extents, such as when we are doing sequential + * writes, so we can properly account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(void *private_data, - struct extent_state *new, - struct extent_state *other) +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other) { - struct inode *inode = private_data; u64 new_size, old_size; u32 num_extents; @@ -1755,15 +1759,12 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, } /* - * extent_io.c set_bit_hook, used to track delayed allocation - * bytes in this file, and to maintain the list of inodes that - * have pending delalloc work to be done. + * Properly track delayed allocation bytes in the inode and to maintain the + * list of inodes that have pending delalloc work to be done. 
*/ -static void btrfs_set_bit_hook(void *private_data, - struct extent_state *state, unsigned *bits) +void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits) { - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1809,14 +1810,14 @@ static void btrfs_set_bit_hook(void *private_data, } /* - * extent_io.c clear_bit_hook, see set_bit_hook for why + * Once a range is no longer delalloc this function ensures that proper + * accounting happens. */ -static void btrfs_clear_bit_hook(void *private_data, - struct extent_state *state, - unsigned *bits) +void btrfs_clear_delalloc_extent(struct inode *vfs_inode, + struct extent_state *state, unsigned *bits) { - struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -1841,7 +1842,7 @@ static void btrfs_clear_bit_hook(void *private_data, /* * We don't reserve metadata space for space cache inodes so we - * don't need to call dellalloc_release_metadata if there is an + * don't need to call delalloc_release_metadata if there is an * error. */ if (*bits & EXTENT_CLEAR_META_RESV && @@ -1880,16 +1881,21 @@ static void btrfs_clear_bit_hook(void *private_data, } /* - * Merge bio hook, this must check the chunk tree to make sure we don't create - * bios that span stripes or chunks + * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit + * in a chunk's stripe. This function ensures that bios do not span a + * stripe/chunk * - * return 1 if page cannot be merged to bio - * return 0 if page can be merged to bio + * @page - The page we are about to add to the bio + * @size - size we want to add to the bio + * @bio - bio we want to ensure is smaller than a stripe + * @bio_flags - flags of the bio + * + * return 1 if page cannot be added to the bio + * return 0 if page can be added to the bio * return error otherwise */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags) +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1932,29 +1938,6 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, } /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num) -{ - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; - - ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); - } - return ret; -} - -/* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. 
* @@ -2056,7 +2039,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { - WARN_ON((end & (PAGE_SIZE - 1)) == 0); + WARN_ON(PAGE_ALIGNED(end)); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, extra_bits, cached_state); } @@ -2152,7 +2135,7 @@ out_page: * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ -static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3159,8 +3142,8 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate) +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3686,6 +3669,21 @@ cache_index: * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* + * Similar reasoning for last_link_trans, needs to be set otherwise + * for a case like the following: + * + * mkdir A + * touch foo + * ln foo A/bar + * echo 2 > /proc/sys/vm/drop_caches + * fsync foo + * <power failure> + * + * Would result in link bar and directory A not existing after the power + * failure. + */ + BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -4444,31 +4442,6 @@ out: return err; } -static int truncate_space_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytes_deleted) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - int ret; - - /* - * This is only used to apply pressure to the enospc system, we don't - * intend to use this reservation at all. - */ - bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted); - bytes_deleted *= fs_info->nodesize; - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - bytes_deleted, BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, - bytes_deleted, 1); - trans->bytes_reserved += bytes_deleted; - } - return ret; - -} - /* * Return this if we need to call truncate_block for the last bit of the * truncate. @@ -4513,7 +4486,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; - bool should_end = false; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4544,7 +4516,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, /* * This function is also used to drop the items in the log tree before * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the loged items. So we shouldn't kill the delayed + * it is used to drop the logged items. So we shouldn't kill the delayed * items. 
*/ if (min_type == 0 && root == BTRFS_I(inode)->root) @@ -4726,15 +4698,7 @@ delete: btrfs_abort_transaction(trans, ret); break; } - if (btrfs_should_throttle_delayed_refs(trans)) - btrfs_async_run_delayed_refs(fs_info, - trans->delayed_ref_updates * 2, - trans->transid, 0); if (be_nice) { - if (truncate_space_check(trans, root, - extent_num_bytes)) { - should_end = true; - } if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } @@ -4745,7 +4709,7 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || - should_throttle || should_end) { + should_throttle) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4757,23 +4721,24 @@ delete: pending_del_nr = 0; } btrfs_release_path(path); - if (should_throttle) { - unsigned long updates = trans->delayed_ref_updates; - if (updates) { - trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, - updates * 2); - if (ret) - break; - } - } + /* - * if we failed to refill our space rsv, bail out - * and let the transaction restart + * We can generate a lot of delayed refs, so we need to + * throttle every once and a while and make sure we're + * adding enough space to keep up with the work we are + * generating. Since we hold a transaction here we + * can't flush, and we don't want to FLUSH_LIMIT because + * we could have generated too many delayed refs to + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. */ - if (should_end) { - ret = -EAGAIN; - break; + if (should_throttle) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } } goto search_again; } else { @@ -4799,18 +4764,6 @@ out: } btrfs_free_path(path); - - if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { - unsigned long updates = trans->delayed_ref_updates; - int err; - - if (updates) { - trans->delayed_ref_updates = 0; - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) - ret = err; - } - } return ret; } @@ -5155,7 +5108,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the end less truncate */ + /* Disable nonlocked read DIO to avoid the endless truncate */ btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); @@ -5333,8 +5286,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans) && - !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) + if (!btrfs_check_space_for_delayed_refs(fs_info) && + !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) return trans; /* If not, commit and try again. 
*/ @@ -6406,14 +6359,19 @@ fail_dir_item: err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); - + if (err) + btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, name_len, ino, parent_ino, &local_index); + if (err) + btrfs_abort_transaction(trans, err); } + + /* Return the original error code */ return ret; } @@ -6625,6 +6583,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; } + BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, true, NULL); @@ -6652,7 +6611,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; int err = 0; - int drop_on_err = 0; u64 objectid = 0; u64 index = 0; @@ -6678,7 +6636,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; } - drop_on_err = 1; /* these must be set before we unlock the inode */ inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -6699,7 +6656,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; d_instantiate_new(dentry, inode); - drop_on_err = 0; out_fail: btrfs_end_transaction(trans); @@ -8053,9 +8009,7 @@ static void btrfs_endio_direct_read(struct bio *bio) dio_bio->bi_status = err; dio_end_io(dio_bio); - - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(err)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -8098,7 +8052,7 @@ static void __endio_write_update_ordered(struct inode *inode, return; /* * Our bio might span multiple ordered extents. In this case - * we keep goin until we have accounted the whole dio. + * we keep going until we have accounted the whole dio. */ if (ordered_offset < offset + bytes) { ordered_bytes = offset + bytes - ordered_offset; @@ -8408,8 +8362,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (io_bio->end_io) - io_bio->end_io(io_bio, ret); + btrfs_io_bio_free_csum(io_bio); free_ordered: /* @@ -8912,7 +8865,7 @@ again: /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) - zero_start = size & ~PAGE_MASK; + zero_start = offset_in_page(size); else zero_start = PAGE_SIZE; @@ -9157,6 +9110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; + ei->last_link_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -9968,7 +9922,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
*/ -static int start_delalloc_inodes(struct btrfs_root *root, int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9996,6 +9950,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr) } spin_unlock(&root->delalloc_lock); + if (snapshot) + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &binode->runtime_flags); work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); @@ -10029,7 +9986,7 @@ out: return ret; } -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -10037,7 +9994,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1); + ret = start_delalloc_inodes(root, -1, true); if (ret > 0) ret = 0; return ret; @@ -10066,7 +10023,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr); + ret = start_delalloc_inodes(root, nr, false); btrfs_put_fs_root(root); if (ret < 0) goto out; @@ -10445,26 +10402,6 @@ out: return ret; } -__attribute__((const)) -static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) -{ - return -EAGAIN; -} - -static void btrfs_check_extent_io_range(void *private_data, const char *caller, - u64 start, u64 end) -{ - struct inode *inode = private_data; - u64 isize; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} - void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { struct inode *inode = tree->private_data; @@ -10481,6 +10418,343 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) } } +#ifdef CONFIG_SWAP +/* + * Add an entry indicating a block group or device which is pinned by a + * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a + * negative errno on failure. + */ +static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, + bool is_block_group) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp, *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + + sp = kmalloc(sizeof(*sp), GFP_NOFS); + if (!sp) + return -ENOMEM; + sp->ptr = ptr; + sp->inode = inode; + sp->is_block_group = is_block_group; + + spin_lock(&fs_info->swapfile_pins_lock); + p = &fs_info->swapfile_pins.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_swapfile_pin, node); + if (sp->ptr < entry->ptr || + (sp->ptr == entry->ptr && sp->inode < entry->inode)) { + p = &(*p)->rb_left; + } else if (sp->ptr > entry->ptr || + (sp->ptr == entry->ptr && sp->inode > entry->inode)) { + p = &(*p)->rb_right; + } else { + spin_unlock(&fs_info->swapfile_pins_lock); + kfree(sp); + return 1; + } + } + rb_link_node(&sp->node, parent, p); + rb_insert_color(&sp->node, &fs_info->swapfile_pins); + spin_unlock(&fs_info->swapfile_pins_lock); + return 0; +} + +/* Free all of the entries pinned by this swapfile. 
*/ +static void btrfs_free_swapfile_pins(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp; + struct rb_node *node, *next; + + spin_lock(&fs_info->swapfile_pins_lock); + node = rb_first(&fs_info->swapfile_pins); + while (node) { + next = rb_next(node); + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (sp->inode == inode) { + rb_erase(&sp->node, &fs_info->swapfile_pins); + if (sp->is_block_group) + btrfs_put_block_group(sp->ptr); + kfree(sp); + } + node = next; + } + spin_unlock(&fs_info->swapfile_pins_lock); +} + +struct btrfs_swap_info { + u64 start; + u64 block_start; + u64 block_len; + u64 lowest_ppage; + u64 highest_ppage; + unsigned long nr_pages; + int nr_extents; +}; + +static int btrfs_add_swap_extent(struct swap_info_struct *sis, + struct btrfs_swap_info *bsi) +{ + unsigned long nr_pages; + u64 first_ppage, first_ppage_reported, next_ppage; + int ret; + + first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, + PAGE_SIZE) >> PAGE_SHIFT; + + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + + first_ppage_reported = first_ppage; + if (bsi->start == 0) + first_ppage_reported++; + if (bsi->lowest_ppage > first_ppage_reported) + bsi->lowest_ppage = first_ppage_reported; + if (bsi->highest_ppage < (next_ppage - 1)) + bsi->highest_ppage = next_ppage - 1; + + ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); + if (ret < 0) + return ret; + bsi->nr_extents += ret; + bsi->nr_pages += nr_pages; + return 0; +} + +static void btrfs_swap_deactivate(struct file *file) +{ + struct inode *inode = file_inode(file); + + btrfs_free_swapfile_pins(inode); + atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + struct extent_map *em = NULL; + struct btrfs_device *device = NULL; + struct btrfs_swap_info bsi = { + .lowest_ppage = (sector_t)-1ULL, + }; + int ret = 0; + u64 isize; + u64 start; + + /* + * If the swap file was just created, make sure delalloc is done. If the + * file changes again after this, the user is doing something stupid and + * we don't really care. + */ + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + return ret; + + /* + * The inode is locked, so these flags won't change after we check them. + */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { + btrfs_warn(fs_info, "swapfile must not be copy-on-write"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_warn(fs_info, "swapfile must not be checksummed"); + return -EINVAL; + } + + /* + * Balance or device remove/replace/resize can move stuff around from + * under us. The EXCL_OP flag makes sure they aren't running/won't run + * concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap file + * is active and moving the extents. Note that this also prevents a + * concurrent device add which isn't actually necessary, but it's not + * really worth the trouble to allow it. 
+ */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + btrfs_warn(fs_info, + "cannot activate swapfile while exclusive operation is running"); + return -EBUSY; + } + /* + * Snapshots can create extents which require COW even if NODATACOW is + * set. We use this counter to prevent snapshots. We must increment it + * before walking the extents because we don't want a concurrent + * snapshot to run after we've already checked the extents. + */ + atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles); + + isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); + + lock_extent_bits(io_tree, 0, isize - 1, &cached_state); + start = 0; + while (start < isize) { + u64 logical_block_start, physical_block_start; + struct btrfs_block_group_cache *bg; + u64 len = isize - start; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->block_start == EXTENT_MAP_HOLE) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + if (em->block_start == EXTENT_MAP_INLINE) { + /* + * It's unlikely we'll ever actually find ourselves + * here, as a file small enough to fit inline won't be + * big enough to store more than the swap header, but in + * case something changes in the future, let's catch it + * here rather than later. + */ + btrfs_warn(fs_info, "swapfile must not be inline"); + ret = -EINVAL; + goto out; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + ret = -EINVAL; + goto out; + } + + logical_block_start = em->block_start + (start - em->start); + len = min(len, em->len - (start - em->start)); + free_extent_map(em); + em = NULL; + + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + } else { + btrfs_warn(fs_info, + "swapfile must not be copy-on-write"); + ret = -EINVAL; + goto out; + } + + em = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + btrfs_warn(fs_info, + "swapfile must have single data profile"); + ret = -EINVAL; + goto out; + } + + if (device == NULL) { + device = em->map_lookup->stripes[0].dev; + ret = btrfs_add_swapfile_pin(inode, device, false); + if (ret == 1) + ret = 0; + else if (ret) + goto out; + } else if (device != em->map_lookup->stripes[0].dev) { + btrfs_warn(fs_info, "swapfile must be on one device"); + ret = -EINVAL; + goto out; + } + + physical_block_start = (em->map_lookup->stripes[0].physical + + (logical_block_start - em->start)); + len = min(len, em->len - (logical_block_start - em->start)); + free_extent_map(em); + em = NULL; + + bg = btrfs_lookup_block_group(fs_info, logical_block_start); + if (!bg) { + btrfs_warn(fs_info, + "could not find block group containing swapfile"); + ret = -EINVAL; + goto out; + } + + ret = btrfs_add_swapfile_pin(inode, bg, true); + if (ret) { + btrfs_put_block_group(bg); + if (ret == 1) + ret = 0; + else + goto out; + } + + if (bsi.block_len && + bsi.block_start + bsi.block_len == physical_block_start) { + bsi.block_len += len; + } else { + if (bsi.block_len) { + ret = btrfs_add_swap_extent(sis, &bsi); + if (ret) + goto out; + } + bsi.start = start; + bsi.block_start = physical_block_start; + bsi.block_len = len; + } + + start += len; + } + + if (bsi.block_len) + ret = btrfs_add_swap_extent(sis, &bsi); + +out: + if (!IS_ERR_OR_NULL(em)) + free_extent_map(em); 
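	/*
	 * A worked example of the rounding in btrfs_add_swap_extent() above,
	 * assuming 4 KiB pages: for a physical run with block_start of
	 * 1 MiB + 512 (1049088) and block_len of 12 KiB, first_ppage is
	 * ALIGN(1049088, 4096) >> PAGE_SHIFT = 257 and next_ppage is
	 * ALIGN_DOWN(1061376, 4096) >> PAGE_SHIFT = 259, so only the two
	 * fully covered pages in the middle are handed to add_swap_extent();
	 * the misaligned head and tail of the run are simply not used for
	 * swap.
	 */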
+ + unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + + if (ret) + btrfs_swap_deactivate(file); + + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + + if (ret) + return ret; + + if (device) + sis->bdev = device->bdev; + *span = bsi.highest_ppage - bsi.lowest_ppage + 1; + sis->max = bsi.nr_pages; + sis->pages = bsi.nr_pages - 1; + sis->highest_bit = bsi.nr_pages - 1; + return bsi.nr_extents; +} +#else +static void btrfs_swap_deactivate(struct file *file) +{ +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + return -EOPNOTSUPP; +} +#endif + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10523,17 +10797,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, - .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, - - /* optional callbacks */ - .fill_delalloc = run_delalloc_range, - .writepage_end_io_hook = btrfs_writepage_end_io_hook, - .writepage_start_hook = btrfs_writepage_start_hook, - .set_bit_hook = btrfs_set_bit_hook, - .clear_bit_hook = btrfs_clear_bit_hook, - .merge_extent_hook = btrfs_merge_extent_hook, - .split_extent_hook = btrfs_split_extent_hook, - .check_extent_io_range = btrfs_check_extent_io_range, }; /* @@ -10558,6 +10821,8 @@ static const struct address_space_operations btrfs_aops = { .releasepage = btrfs_releasepage, .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 802a628e9f7d..fab9443f6a42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -290,6 +290,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } else if (fsflags & FS_COMPR_FL) { const char *comp; + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_unlock; + } + binode->flags |= BTRFS_INODE_COMPRESS; binode->flags &= ~BTRFS_INODE_NOCOMPRESS; @@ -754,6 +759,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + if (atomic_read(&root->nr_swapfiles)) { + btrfs_warn(fs_info, + "cannot snapshot subvolume with active swapfile"); + return -ETXTBSY; + } + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); if (!pending_snapshot) return -ENOMEM; @@ -777,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, wait_event(root->subv_writers->wait, percpu_counter_sum(&root->subv_writers->counter) == 0); - ret = btrfs_start_delalloc_inodes(root); + ret = btrfs_start_delalloc_snapshot(root); if (ret) goto dec_and_free; @@ -1505,9 +1516,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } inode_lock(inode); - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = cluster_pages_for_defrag(inode, pages, i, cluster); + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + } else { + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = cluster_pages_for_defrag(inode, pages, i, cluster); + } if (ret < 0) { inode_unlock(inode); goto out_ra; @@ -3135,7 +3150,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } rcu_read_unlock(); - memcpy(&fi_args->fsid, fs_info->fsid, 
sizeof(fi_args->fsid)); + memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; @@ -3191,92 +3206,6 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) -{ - struct page *page; - - page = grab_cache_page(inode->i_mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); - - if (!PageUptodate(page)) { - int ret; - - ret = btrfs_readpage(NULL, page); - if (ret) - return ERR_PTR(ret); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EIO); - } - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EAGAIN); - } - } - - return page; -} - -static int gather_extent_pages(struct inode *inode, struct page **pages, - int num_pages, u64 off) -{ - int i; - pgoff_t index = off >> PAGE_SHIFT; - - for (i = 0; i < num_pages; i++) { -again: - pages[i] = extent_same_get_page(inode, index + i); - if (IS_ERR(pages[i])) { - int err = PTR_ERR(pages[i]); - - if (err == -EAGAIN) - goto again; - pages[i] = NULL; - return err; - } - } - return 0; -} - -static int lock_extent_range(struct inode *inode, u64 off, u64 len, - bool retry_range_locking) -{ - /* - * Do any pending delalloc/csum calculations on inode, one way or - * another, and lock file content. - * The locking order is: - * - * 1) pages - * 2) range in the inode's io tree - */ - while (1) { - struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); - ordered = btrfs_lookup_first_ordered_extent(inode, - off + len - 1); - if ((!ordered || - ordered->file_offset + ordered->len <= off || - ordered->file_offset >= off + len) && - !test_range_bit(&BTRFS_I(inode)->io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); - if (ordered) - btrfs_put_ordered_extent(ordered); - if (!retry_range_locking) - return -EAGAIN; - btrfs_wait_ordered_range(inode, off, len); - } - return 0; -} - static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { inode_unlock(inode1); @@ -3292,261 +3221,32 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) inode_lock_nested(inode2, I_MUTEX_CHILD); } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len, - bool retry_range_locking) -{ - int ret; - - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - } - ret = lock_extent_range(inode1, loff1, len, retry_range_locking); - if (ret) - return ret; - ret = lock_extent_range(inode2, loff2, len, retry_range_locking); - if (ret) - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, - loff1 + len - 1); - return ret; -} - -struct cmp_pages { - int num_pages; - struct page **src_pages; - struct page **dst_pages; -}; - -static void btrfs_cmp_data_free(struct cmp_pages *cmp) -{ - int i; - struct page *pg; - - for (i = 0; i < cmp->num_pages; i++) { - pg = cmp->src_pages[i]; - if (pg) { - unlock_page(pg); - put_page(pg); - cmp->src_pages[i] = NULL; - } - pg = 
cmp->dst_pages[i]; - if (pg) { - unlock_page(pg); - put_page(pg); - cmp->dst_pages[i] = NULL; - } - } -} - -static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, - struct inode *dst, u64 dst_loff, - u64 len, struct cmp_pages *cmp) -{ - int ret; - int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; - - cmp->num_pages = num_pages; - - ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); - if (ret) - goto out; - - ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); - -out: - if (ret) - btrfs_cmp_data_free(cmp); - return ret; -} - -static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) -{ - int ret = 0; - int i; - struct page *src_page, *dst_page; - unsigned int cmp_len = PAGE_SIZE; - void *addr, *dst_addr; - - i = 0; - while (len) { - if (len < PAGE_SIZE) - cmp_len = len; - - BUG_ON(i >= cmp->num_pages); - - src_page = cmp->src_pages[i]; - dst_page = cmp->dst_pages[i]; - ASSERT(PageLocked(src_page)); - ASSERT(PageLocked(dst_page)); - - addr = kmap_atomic(src_page); - dst_addr = kmap_atomic(dst_page); - - flush_dcache_page(src_page); - flush_dcache_page(dst_page); - - if (memcmp(addr, dst_addr, cmp_len)) - ret = -EBADE; - - kunmap_atomic(addr); - kunmap_atomic(dst_addr); - - if (ret) - break; - - len -= cmp_len; - i++; - } - - return ret; -} - -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, - u64 olen) -{ - u64 len = *plen; - u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - - if (off + olen > inode->i_size || off + olen < off) - return -EINVAL; - - /* if we extend to eof, continue to block boundary */ - if (off + len == inode->i_size) - *plen = len = ALIGN(inode->i_size, bs) - off; - - /* Check that we are block aligned - btrfs_clone() requires this */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) - return -EINVAL; - - return 0; -} - static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff, - struct cmp_pages *cmp) + struct inode *dst, u64 dst_loff) { + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; int ret; u64 len = olen; - bool same_inode = (src == dst); - u64 same_lock_start = 0; - u64 same_lock_len = 0; - - ret = extent_same_check_offsets(src, loff, &len, olen); - if (ret) - return ret; - - ret = extent_same_check_offsets(dst, dst_loff, &len, olen); - if (ret) - return ret; - - if (same_inode) { - /* - * Single inode case wants the same checks, except we - * don't want our length pushed out past i_size as - * comparing that data range makes no sense. - * - * extent_same_check_offsets() will do this for an - * unaligned length at i_size, so catch it here and - * reject the request. - * - * This effectively means we require aligned extents - * for the single-inode case, whereas the other cases - * allow an unaligned length so long as it ends at - * i_size. - */ - if (len != olen) - return -EINVAL; - - /* Check for overlapping ranges */ - if (dst_loff + len > loff && dst_loff < loff + len) - return -EINVAL; - - same_lock_start = min_t(u64, loff, dst_loff); - same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; - } else { - /* - * If the source and destination inodes are different, the - * source's range end offset matches the source's i_size, that - * i_size is not a multiple of the sector size, and the - * destination range does not go past the destination's i_size, - * we must round down the length to the nearest sector size - * multiple. 
If we don't do this adjustment we end replacing - * with zeroes the bytes in the range that starts at the - * deduplication range's end offset and ends at the next sector - * size multiple. - */ - if (loff + olen == i_size_read(src) && - dst_loff + len < i_size_read(dst)) { - const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; - - len = round_down(i_size_read(src), sz) - loff; - if (len == 0) - return 0; - olen = len; - } - } - -again: - ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); - if (ret) - return ret; - if (same_inode) - ret = lock_extent_range(src, same_lock_start, same_lock_len, - false); - else - ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, - false); + if (loff + len == src->i_size) + len = ALIGN(src->i_size, bs) - loff; /* - * If one of the inodes has dirty pages in the respective range or - * ordered extents, we need to flush dellaloc and wait for all ordered - * extents in the range. We must unlock the pages and the ranges in the - * io trees to avoid deadlocks when flushing delalloc (requires locking - * pages) and when waiting for ordered extents to complete (they require - * range locking). + * For same inode case we don't want our length pushed out past i_size + * as comparing that data range makes no sense. + * + * This effectively means we require aligned extents for the single + * inode case, whereas the other cases allow an unaligned length so long + * as it ends at i_size. */ - if (ret == -EAGAIN) { - /* - * Ranges in the io trees already unlocked. Now unlock all - * pages before waiting for all IO to complete. - */ - btrfs_cmp_data_free(cmp); - if (same_inode) { - btrfs_wait_ordered_range(src, same_lock_start, - same_lock_len); - } else { - btrfs_wait_ordered_range(src, loff, len); - btrfs_wait_ordered_range(dst, dst_loff, len); - } - goto again; - } - ASSERT(ret == 0); - if (WARN_ON(ret)) { - /* ranges in the io trees already unlocked */ - btrfs_cmp_data_free(cmp); - return ret; - } - - /* pass original length for comparison so we stay within i_size */ - ret = btrfs_cmp_data(olen, cmp); - if (ret == 0) - ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); - - if (same_inode) - unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, - same_lock_start + same_lock_len - 1); - else - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + if (dst == src && len != olen) + return -EINVAL; - btrfs_cmp_data_free(cmp); + /* + * Lock destination range to serialize with concurrent readpages(). 
+ */ + lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); return ret; } @@ -3557,58 +3257,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; - struct cmp_pages cmp; int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; - bool same_inode = (src == dst); u64 i, tail_len, chunk_count; - if (olen == 0) - return 0; - - if (same_inode) - inode_lock(src); - else - btrfs_double_inode_lock(src, dst); - /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) + return -EINVAL; + + if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) + return -ETXTBSY; tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); if (chunk_count == 0) num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; - /* - * If deduping ranges in the same inode, locking rules make it - * mandatory to always lock pages in ascending order to avoid deadlocks - * with concurrent tasks (such as starting writeback/delalloc). - */ - if (same_inode && dst_loff < loff) - swap(loff, dst_loff); - - /* - * We must gather up all the pages before we initiate our extent - * locking. We use an array for the page pointers. Size of the array is - * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. - */ - cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); - cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); - if (!cmp.src_pages || !cmp.dst_pages) { - ret = -ENOMEM; - goto out_free; - } - for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff, &cmp); + dst, dst_loff); if (ret) - goto out_free; + return ret; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; @@ -3616,17 +3285,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (tail_len > 0) ret = btrfs_extent_same_range(src, loff, tail_len, dst, - dst_loff, &cmp); - -out_free: - kvfree(cmp.src_pages); - kvfree(cmp.dst_pages); - -out_unlock: - if (same_inode) - inode_unlock(src); - else - btrfs_double_inode_unlock(src, dst); + dst_loff); return ret; } @@ -4213,11 +3872,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; u64 len = olen; u64 bs = fs_info->sb->s_blocksize; - int same_inode = src == inode; /* * TODO: @@ -4230,101 +3887,35 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * be either compressed or non-compressed. 
*/ - if (btrfs_root_readonly(root)) - return -EROFS; - - if (file_src->f_path.mnt != file->f_path.mnt || - src->i_sb != inode->i_sb) - return -EXDEV; - - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - return -EISDIR; - - if (!same_inode) { - btrfs_double_inode_lock(src, inode); - } else { - inode_lock(src); - } - /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + return -EINVAL; + + if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) + return -ETXTBSY; - /* determine range to clone */ - ret = -EINVAL; - if (off + len > src->i_size || off + len < off) - goto out_unlock; - if (len == 0) - olen = len = src->i_size - off; /* - * If we extend to eof, continue to block boundary if and only if the - * destination end offset matches the destination file's size, otherwise - * we would be corrupting data by placing the eof block into the middle - * of a file. + * VFS's generic_remap_file_range_prep() protects us from cloning the + * eof block into the middle of a file, which would result in corruption + * if the file size is not blocksize aligned. So we don't need to check + * for that case here. */ - if (off + len == src->i_size) { - if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size) - goto out_unlock; + if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; - } - - if (len == 0) { - ret = 0; - goto out_unlock; - } - - /* verify the end result is block aligned */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || - !IS_ALIGNED(destoff, bs)) - goto out_unlock; - - /* verify if ranges are overlapped within the same file */ - if (same_inode) { - if (destoff + len > off && destoff < off + len) - goto out_unlock; - } if (destoff > inode->i_size) { ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) - goto out_unlock; + return ret; } /* - * Lock the target range too. Right after we replace the file extent - * items in the fs tree (which now point to the cloned data), we might - * have a worker replace them with extent items relative to a write - * operation that was issued before this clone operation (i.e. confront - * with inode.c:btrfs_finish_ordered_io). + * Lock destination range to serialize with concurrent readpages(). */ - if (same_inode) { - u64 lock_start = min_t(u64, off, destoff); - u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - - ret = lock_extent_range(src, lock_start, lock_len, true); - } else { - ret = btrfs_double_extent_lock(src, off, inode, destoff, len, - true); - } - ASSERT(ret == 0); - if (WARN_ON(ret)) { - /* ranges in the io trees already unlocked */ - goto out_unlock; - } - + lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - - if (same_inode) { - u64 lock_start = min_t(u64, off, destoff); - u64 lock_end = max_t(u64, off, destoff) + len - 1; - - unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); - } else { - btrfs_double_extent_unlock(src, off, inode, destoff, len); - } + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. 
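	/*
	 * A worked example of the eof rounding done in btrfs_clone_files()
	 * and btrfs_extent_same_range() above, assuming a 4 KiB block size:
	 * cloning off = 8192, len = 1808 from a source whose i_size is 10000
	 * hits the "off + len == src->i_size" case, so len becomes
	 * ALIGN(10000, 4096) - 8192 = 4096 and the whole tail block is
	 * cloned. Landing that partial eof block in the middle of the
	 * destination would corrupt data, which, per the comment above, is
	 * the case generic_remap_file_range_prep() already rejects before we
	 * get here.
	 */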
@@ -4332,11 +3923,87 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, truncate_inode_pages_range(&inode->i_data, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); -out_unlock: + + return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + bool same_inode = inode_out == inode_in; + u64 wb_len; + int ret; + + if (!(remap_flags & REMAP_FILE_DEDUP)) { + struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + + if (btrfs_root_readonly(root_out)) + return -EROFS; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + } + + if (same_inode) + inode_lock(inode_in); + else + btrfs_double_inode_lock(inode_in, inode_out); + + /* + * Now that the inodes are locked, we need to start writeback ourselves + * and can not rely on the writeback from the VFS's generic helper + * generic_remap_file_range_prep() because: + * + * 1) For compression we must call filemap_fdatawrite_range() range + * twice (btrfs_fdatawrite_range() does it for us), and the generic + * helper only calls it once; + * + * 2) filemap_fdatawrite_range(), called by the generic helper only + * waits for the writeback to complete, i.e. for IO to be done, and + * not for the ordered extents to complete. We need to wait for them + * to complete so that new file extent items are in the fs tree. + */ + if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) + wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + else + wb_len = ALIGN(*len, bs); + + /* + * Since we don't lock ranges, wait for ongoing lockless dio writes (as + * any in progress could create its ordered extents after we wait for + * existing ordered extents below). + */ + inode_dio_wait(inode_in); if (!same_inode) - btrfs_double_inode_unlock(src, inode); + inode_dio_wait(inode_out); + + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + wb_len); + if (ret < 0) + goto out_unlock; + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + wb_len); + if (ret < 0) + goto out_unlock; + + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); + if (ret < 0 || *len == 0) + goto out_unlock; + + return 0; + + out_unlock: + if (same_inode) + inode_unlock(inode_in); else - inode_unlock(src); + btrfs_double_inode_unlock(inode_in, inode_out); + return ret; } @@ -4344,29 +4011,29 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + bool same_inode = dst_inode == src_inode; int ret; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (remap_flags & REMAP_FILE_DEDUP) { - struct inode *src = file_inode(src_file); - struct inode *dst = file_inode(dst_file); - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { - /* - * Btrfs does not support blocksize < page_size. As a - * result, btrfs_cmp_data() won't correctly handle - * this situation without an update. 
- */ - return -EINVAL; - } + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, + &len, remap_flags); + if (ret < 0 || len == 0) + return ret; - ret = btrfs_extent_same(src, off, len, dst, destoff); - } else { + if (remap_flags & REMAP_FILE_DEDUP) + ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); - } + + if (same_inode) + inode_unlock(src_inode); + else + btrfs_double_inode_unlock(src_inode, dst_inode); + return ret < 0 ? ret : len; } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b6a4cc178bee..90639140439f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -27,7 +27,7 @@ * Records the total size (including the header) of compressed data. * * 2. Segment(s) - * Variable size. Each segment includes one segment header, followd by data + * Variable size. Each segment includes one segment header, followed by data * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0c4ef208b8b9..6fde2b2741ef 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; - bool dec_pending_ordered = false; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); - if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) - dec_pending_ordered = true; spin_unlock_irq(&tree->lock); - /* - * The current running transaction is waiting on us, we need to let it - * know that we're complete and wake it up. - */ - if (dec_pending_ordered) { - struct btrfs_transaction *trans; - - /* - * The checks for trans are just a formality, it should be set, - * but if it isn't we don't want to deref/assert under the spin - * lock, so be nice and check if trans is set, but ASSERT() so - * if it isn't set a developer will notice. - */ - spin_lock(&fs_info->trans_lock); - trans = fs_info->running_transaction; - if (trans) - refcount_inc(&trans->use_count); - spin_unlock(&fs_info->trans_lock); - - ASSERT(trans); - if (trans) { - if (atomic_dec_and_test(&trans->pending_ordered)) - wake_up(&trans->pending_wait); - btrfs_put_transaction(trans); - } - } - spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 02d813aaa261..fb9a161f0215 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -37,28 +37,31 @@ struct btrfs_ordered_sum { * rbtree, just before waking any waiters. It is used to indicate the * IO is done and any metadata is inserted into the tree. 
*/ -#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ - -#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ - -#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ - -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ - -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ - -#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ - -#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ - -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent - * has done its due diligence in updating - * the isize. */ -#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */ - -#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to - * complete in the current transaction. */ -#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */ +enum { + /* set when all the pages are written */ + BTRFS_ORDERED_IO_DONE, + /* set when removed from the tree */ + BTRFS_ORDERED_COMPLETE, + /* set when we want to write in place */ + BTRFS_ORDERED_NOCOW, + /* writing a zlib compressed extent */ + BTRFS_ORDERED_COMPRESSED, + /* set when writing to preallocated extent */ + BTRFS_ORDERED_PREALLOC, + /* set when we're doing DIO with this extent */ + BTRFS_ORDERED_DIRECT, + /* We had an io error when writing this out */ + BTRFS_ORDERED_IOERR, + /* + * indicates whether this ordered extent has done its due diligence in + * updating the isize + */ + BTRFS_ORDERED_UPDATED_ISIZE, + /* Set when we have to truncate an extent */ + BTRFS_ORDERED_TRUNCATED, + /* Regular IO for COW */ + BTRFS_ORDERED_REGULAR, +}; struct btrfs_ordered_extent { /* logical offset in the file */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 45868fd76209..4e473a998219 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,7 +30,7 @@ * - sync * - copy also limits on subvol creation * - limit - * - caches fuer ulists + * - caches for ulists * - performance benchmarks * - check all ioctl parameters */ @@ -522,7 +522,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) __del_qgroup_rb(qgroup); } /* - * we call btrfs_free_qgroup_config() when umounting + * We call btrfs_free_qgroup_config() when unmounting * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ @@ -1013,16 +1013,22 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } - spin_lock(&fs_info->qgroup_lock); - fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - spin_unlock(&fs_info->qgroup_lock); ret = btrfs_commit_transaction(trans); trans = NULL; if (ret) goto out_free_path; + /* + * Set quota enabled flag after committing the transaction, to avoid + * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot + * creation. + */ + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1122,7 +1128,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. 
* - * In this case, all exclsuive extents will also be exlusive for parent, so + * In this case, all exclusive extents will also be exclusive for parent, so * excl/rfer just get added/removed. * * So is qgroup reservation space, which should also be added/removed to @@ -1749,14 +1755,14 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) * * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. - * They should be marked during preivous (@dst_level = 1) iteration. + * They should be marked during previous (@dst_level = 1) iteration. * * 3) Mark file extents in leaves dirty * We don't have good way to pick out new file extents only. * So we still follow the old method by scanning all file extents in * the leave. * - * This function can free us from keeping two pathes, thus later we only need + * This function can free us from keeping two paths, thus later we only need * to care about how to iterate all new tree blocks in reloc tree. */ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, @@ -1895,7 +1901,7 @@ out: * * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace * above tree blocks along with their counter parts in file tree. - * While during search, old tree blocsk OO(c) will be skiped as tree block swap + * While during search, old tree blocks OO(c) will be skipped as tree block swap * won't affect OO(c). */ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, @@ -2020,7 +2026,7 @@ out: * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, * and then go down @src_eb (pointed by @src_parent and @src_slot) to find - * the conterpart of the tree block, then mark both tree blocks as qgroup dirty, + * the counterpart of the tree block, then mark both tree blocks as qgroup dirty, * and skip all tree blocks whose generation is smaller than last_snapshot. 
* * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), @@ -2659,7 +2665,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, int i; u64 *i_qgroups; struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; u32 level_size = 0; @@ -2669,6 +2675,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) goto out; + quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; goto out; @@ -3103,9 +3110,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); goto out; } - extent_buffer_get(scratch_leaf); - btrfs_tree_read_lock(scratch_leaf); - btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3131,10 +3135,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, goto out; } out: - if (scratch_leaf) { - btrfs_tree_read_unlock_blocking(scratch_leaf); + if (scratch_leaf) free_extent_buffer(scratch_leaf); - } if (done && !ret) { ret = 1; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d8f78f5ab854..20c6bd5fa701 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -70,7 +70,7 @@ struct btrfs_qgroup_extent_record { * be converted into META_PERTRANS. */ enum btrfs_qgroup_rsv_type { - BTRFS_QGROUP_RSV_DATA = 0, + BTRFS_QGROUP_RSV_DATA, BTRFS_QGROUP_RSV_META_PERTRANS, BTRFS_QGROUP_RSV_META_PREALLOC, BTRFS_QGROUP_RSV_LAST, @@ -81,10 +81,10 @@ enum btrfs_qgroup_rsv_type { * * Each type should have different reservation behavior. * E.g, data follows its io_tree flag modification, while - * *currently* meta is just reserve-and-clear during transcation. + * *currently* meta is just reserve-and-clear during transaction. * * TODO: Add new type for reservation which can survive transaction commit. - * Currect metadata reservation behavior is not suitable for such case. + * Current metadata reservation behavior is not suitable for such case. */ struct btrfs_qgroup_rsv { u64 values[BTRFS_QGROUP_RSV_LAST]; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index df41d7049936..e74455eb42f9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1980,7 +1980,7 @@ cleanup_io: * - In case of single failure, where rbio->failb == -1: * * Cache this rbio iff the above read reconstruction is - * excuted without problems. + * executed without problems. 
*/ if (err == BLK_STS_OK && rbio->failb < 0) cache_rbio_pages(rbio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index dec14b739b10..10d9589001a9 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -376,26 +376,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; } + /* Insert extent in reada tree + all per-device trees, all or nothing */ + down_read(&fs_info->dev_replace.rwsem); ret = radix_tree_preload(GFP_KERNEL); - if (ret) + if (ret) { + up_read(&fs_info->dev_replace.rwsem); goto error; + } - /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_read_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); + up_read(&fs_info->dev_replace.rwsem); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); + up_read(&fs_info->dev_replace.rwsem); goto error; } radix_tree_preload_end(); @@ -437,13 +439,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, } radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); goto error; } have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); if (!have_zone) goto error; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d69fbfb30aa9..c3557c12656b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -43,7 +43,7 @@ struct ref_entry { * back to the delayed ref action. We hold the ref we are changing in the * action so we can account for the history properly, and we record the root we * were called with since it could be different from ref_root. We also store - * stack traces because thats how I roll. + * stack traces because that's how I roll. */ struct ref_action { int action; @@ -56,7 +56,7 @@ struct ref_action { /* * One of these for every block we reference, it holds the roots and references - * to it as well as all of the ref actions that have occured to it. We never + * to it as well as all of the ref actions that have occurred to it. We never * free it until we unmount the file system in order to make sure re-allocations * are happening properly. */ @@ -859,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * This shouldn't happen because we will add our re * above when we lookup the be with !parent, but just in * case catch this case so we don't panic because I - * didn't thik of some other corner case. + * didn't think of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", root->root_key.objectid, be->bytenr); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 924116f654a1..272b287f8cf0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2631,7 +2631,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * only one thread can access block_rsv at this point, * so we don't need hold lock to protect block_rsv. 
* we expand more reservation size here to allow enough - * space for relocation and we will return eailer in + * space for relocation and we will return earlier in * enospc case. */ rc->block_rsv->size = tmp + fs_info->nodesize * @@ -3959,6 +3959,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans); + trans = NULL; continue; } @@ -4184,37 +4185,13 @@ static struct reloc_control *alloc_reloc_control(void) static void describe_relocation(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - char buf[128]; /* prefixed by a '|' that'll be dropped */ - u64 flags = block_group->flags; + char buf[128] = {'\0'}; - /* Shouldn't happen */ - if (!flags) { - strcpy(buf, "|NONE"); - } else { - char *bp = buf; - -#define DESCRIBE_FLAG(f, d) \ - if (flags & BTRFS_BLOCK_GROUP_##f) { \ - bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ - flags &= ~BTRFS_BLOCK_GROUP_##f; \ - } - DESCRIBE_FLAG(DATA, "data"); - DESCRIBE_FLAG(SYSTEM, "system"); - DESCRIBE_FLAG(METADATA, "metadata"); - DESCRIBE_FLAG(RAID0, "raid0"); - DESCRIBE_FLAG(RAID1, "raid1"); - DESCRIBE_FLAG(DUP, "dup"); - DESCRIBE_FLAG(RAID10, "raid10"); - DESCRIBE_FLAG(RAID5, "raid5"); - DESCRIBE_FLAG(RAID6, "raid6"); - if (flags) - snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); -#undef DESCRIBE_FLAG - } + btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf + 1); + block_group->key.objectid, buf); } /* @@ -4222,6 +4199,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { + struct btrfs_block_group_cache *bg; struct btrfs_root *extent_root = fs_info->extent_root; struct reloc_control *rc; struct inode *inode; @@ -4230,14 +4208,23 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + bg = btrfs_lookup_block_group(fs_info, group_start); + if (!bg) + return -ENOENT; + + if (btrfs_pinned_by_swapfile(fs_info, bg)) { + btrfs_put_block_group(bg); + return -ETXTBSY; + } + rc = alloc_reloc_control(); - if (!rc) + if (!rc) { + btrfs_put_block_group(bg); return -ENOMEM; + } rc->extent_root = extent_root; - - rc->block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!rc->block_group); + rc->block_group = bg; ret = btrfs_inc_block_group_ro(rc->block_group); if (ret) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..6dcd36d7b849 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -339,7 +339,9 @@ static struct full_stripe_lock *insert_full_stripe_lock( } } - /* Insert new lock */ + /* + * Insert new lock. 
+ */ ret = kmalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return ERR_PTR(-ENOMEM); @@ -568,12 +570,11 @@ static void scrub_put_ctx(struct scrub_ctx *sctx) scrub_free_ctx(sctx); } -static noinline_for_stack -struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) +static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( + struct btrfs_fs_info *fs_info, int is_dev_replace) { struct scrub_ctx *sctx; int i; - struct btrfs_fs_info *fs_info = dev->fs_info; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) @@ -582,7 +583,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; - sctx->fs_info = dev->fs_info; + sctx->fs_info = fs_info; for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; @@ -832,6 +833,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int page_num; int success; bool full_stripe_locked; + unsigned int nofs_flag; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -857,6 +859,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) dev = sblock_to_check->pagev[0]->dev; /* + * We must use GFP_NOFS because the scrub task might be waiting for a + * worker task executing this function and in turn a transaction commit + * might be waiting the scrub task to pause (which needs to wait for all + * the worker tasks to complete before pausing). + * We do allocations in the workers through insert_full_stripe_lock() + * and scrub_add_page_to_wr_bio(), which happens down the call chain of + * this function. + */ + nofs_flag = memalloc_nofs_save(); + /* * For RAID5/6, race can happen for a different device scrub thread. * For data corruption, Parity and Data threads will both try * to recovery the data. @@ -865,6 +877,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); if (ret < 0) { + memalloc_nofs_restore(nofs_flag); spin_lock(&sctx->stat_lock); if (ret == -ENOMEM) sctx->stat.malloc_errors++; @@ -904,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, - sizeof(*sblocks_for_recheck), GFP_NOFS); + sizeof(*sblocks_for_recheck), GFP_KERNEL); if (!sblocks_for_recheck) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1202,6 +1215,7 @@ out: } ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + memalloc_nofs_restore(nofs_flag); if (ret < 0) return ret; return 0; @@ -3540,7 +3554,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!ret && sctx->is_dev_replace) { /* * If we are doing a device replace wait for any tasks - * that started dellaloc right before we set the block + * that started delalloc right before we set the block * group to RO mode, as they might have just allocated * an extent from it or decided they could do a nocow * write. 
And if any such tasks did that, wait for their @@ -3596,11 +3610,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } - btrfs_dev_replace_write_lock(&fs_info->dev_replace); + down_write(&fs_info->dev_replace.rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); + up_write(&dev_replace->rwsem); + ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache); @@ -3636,10 +3651,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - btrfs_dev_replace_write_lock(&fs_info->dev_replace); + down_write(&fs_info->dev_replace.rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); + up_write(&fs_info->dev_replace.rwsem); if (ro_set) btrfs_dec_block_group_ro(cache); @@ -3772,6 +3787,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; + unsigned int nofs_flag; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -3813,13 +3829,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } + /* Allocate outside of device_list_mutex */ + sctx = scrub_setup_ctx(fs_info, is_dev_replace); + if (IS_ERR(sctx)) + return PTR_ERR(sctx); mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; + ret = -ENODEV; + goto out_free_ctx; } if (!is_dev_replace && !readonly && @@ -3827,7 +3848,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); - return -EROFS; + ret = -EROFS; + goto out_free_ctx; } mutex_lock(&fs_info->scrub_lock); @@ -3835,34 +3857,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EIO; + ret = -EIO; + goto out_free_ctx; } - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EINPROGRESS; + ret = -EINPROGRESS; + goto out_free_ctx; } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return ret; + goto out_free_ctx; } - sctx = scrub_setup_ctx(dev, is_dev_replace); - if (IS_ERR(sctx)) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); - return PTR_ERR(sctx); - } sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3875,6 +3892,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info 
*fs_info, u64 devid, u64 start, atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); + /* + * In order to avoid deadlock with reclaim when there is a transaction + * trying to pause scrub, make sure we use GFP_NOFS for all the + * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() + * invoked by our callees. The pausing request is done when the + * transaction commit starts, and it blocks the transaction until scrub + * is paused (done at specific points at scrub_stripe() or right above + * before incrementing fs_info->scrubs_running). + */ + nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { /* * by holding device list mutex, we can @@ -3887,6 +3914,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); + memalloc_nofs_restore(nofs_flag); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); @@ -3905,6 +3933,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_put_ctx(sctx); return ret; + +out_free_ctx: + scrub_free_ctx(sctx); + + return ret; } void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 094cc1444a90..1b15b43905f8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2238,7 +2238,7 @@ out: * inodes "orphan" name instead of the real name and stop. Same with new inodes * that were not created yet and overwritten inodes/refs. * - * When do we have have orphan inodes: + * When do we have orphan inodes: * 1. When an inode is freshly created and thus no valid refs are available yet * 2. When a directory lost all it's refs (deleted) but still has dir items * inside which were not processed yet (pending for move/delete). If anyone @@ -3340,7 +3340,8 @@ static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) kfree(m); } -static void tail_append_pending_moves(struct pending_dir_move *moves, +static void tail_append_pending_moves(struct send_ctx *sctx, + struct pending_dir_move *moves, struct list_head *stack) { if (list_empty(&moves->list)) { @@ -3351,6 +3352,10 @@ static void tail_append_pending_moves(struct pending_dir_move *moves, list_add_tail(&moves->list, stack); list_splice_tail(&list, stack); } + if (!RB_EMPTY_NODE(&moves->node)) { + rb_erase(&moves->node, &sctx->pending_dir_moves); + RB_CLEAR_NODE(&moves->node); + } } static int apply_children_dir_moves(struct send_ctx *sctx) @@ -3365,7 +3370,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) return 0; INIT_LIST_HEAD(&stack); - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { pm = list_first_entry(&stack, struct pending_dir_move, list); @@ -3376,7 +3381,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) goto out; pm = get_pending_dir_moves(sctx, parent_ino); if (pm) - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); } return 0; @@ -3849,7 +3854,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) /* * We may have refs where the parent directory does not exist * yet. This happens if the parent directories inum is higher - * the the current inum. To handle this case, we create the + * than the current inum. To handle this case, we create the * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. 
*/ @@ -4770,7 +4775,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_key key; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; - unsigned pg_offset = offset & ~PAGE_MASK; + unsigned pg_offset = offset_in_page(offset); ssize_t ret = 0; key.objectid = sctx->cur_ino; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index cbc9d0d2c12d..368a5b9e6c13 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -93,7 +93,7 @@ const char *btrfs_decode_error(int errno) /* * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the approciate error response. + * invokes the appropriate error response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, @@ -151,7 +151,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * although there is no way to update the progress. It would add the * risk of a deadlock, therefore the canceling is omitted. The only * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writeable + * completes. The next time when the filesystem is mounted writable * again, the device replace operation continues. */ } @@ -1848,7 +1848,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (!btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, - "too many missing devices, writeable remount is not allowed"); + "too many missing devices, writable remount is not allowed"); ret = -EACCES; goto restore; } @@ -2090,7 +2090,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_free_data = 0; u64 total_free_meta = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; @@ -2237,6 +2237,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); + vol->name[BTRFS_PATH_NAME_MAX] = '\0'; switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -2311,7 +2312,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * device_list_mutex here as we only read the device data and the list * is protected by RCU. Even if a device is deleted during the list * traversals, we'll get valid data, the freeing callback will wait at - * least until until the rcu_read_unlock. + * least until the rcu_read_unlock. 
*/ rcu_read_lock(); cur_devices = fs_info->fs_devices; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3717c864ba23..5a5930e3d32b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -191,6 +191,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); static struct attribute *btrfs_supported_feature_attrs[] = { @@ -204,6 +205,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), NULL }; @@ -505,12 +507,24 @@ static ssize_t quota_override_store(struct kobject *kobj, BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); +static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%pU\n", + fs_info->fs_devices->metadata_uuid); +} + +BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), BTRFS_ATTR_PTR(, sectorsize), BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), + BTRFS_ATTR_PTR(, metadata_uuid), NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c6ee600aff89..40716b357c1d 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -9,7 +9,7 @@ extern u64 btrfs_debugfs_test; enum btrfs_feature_set { - FEAT_COMPAT = 0, + FEAT_COMPAT, FEAT_COMPAT_RO, FEAT_INCOMPAT, FEAT_MAX diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index db72b3b6209e..8a59597f1883 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -174,8 +174,10 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; - if (root->node) + if (root->node) { + /* One for allocate_extent_buffer */ free_extent_buffer(root->node); + } kfree(root); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e0f4a01be14..3c46d7f23456 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -62,10 +62,11 @@ static int test_find_delalloc(u32 sectorsize) struct page *page; struct page *locked_page = NULL; unsigned long index = 0; - u64 total_dirty = SZ_256M; - u64 max_bytes = SZ_128M; + /* In this test we need at least 2 file extents at its maximum size */ + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + u64 total_dirty = 2 * max_bytes; u64 start, end, test_start; - u64 found; + bool found; int ret = -EINVAL; test_msg("running find delalloc tests"); @@ -76,7 +77,7 @@ static int test_find_delalloc(u32 sectorsize) return -ENOMEM; } - extent_io_tree_init(&tmp, inode); + extent_io_tree_init(&tmp, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -106,8 +107,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { 
test_err("should have found at least one delalloc"); goto out_bits; @@ -137,8 +138,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("couldn't find delalloc in our range"); goto out_bits; @@ -171,8 +172,8 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (found) { test_err("found range when we shouldn't have"); goto out_bits; @@ -192,8 +193,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("didn't find our range"); goto out_bits; @@ -233,8 +234,8 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("didn't find our range"); goto out_bits; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 64043f028820..af0c8e30d9e2 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } - /* - * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidentally release our page. - */ - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); ret = -EINVAL; @@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); BTRFS_I(inode)->root = root; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1eeef9ec5da..127fa1535f58 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -233,14 +233,12 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); - init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ refcount_set(&cur_trans->use_count, 2); - atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = ktime_get_seconds(); @@ -456,7 +454,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool enforce_qgroups) { struct btrfs_fs_info *fs_info = root->fs_info; - + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; @@ -485,13 +483,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. 
*/ if (num_items && root != fs_info->chunk_root) { + struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; + u64 delayed_refs_bytes = 0; + qgroup_reserved = num_items * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, enforce_qgroups); if (ret) return ERR_PTR(ret); + /* + * We want to reserve all the bytes we may need all at once, so + * we only do 1 enospc flushing cycle per transaction start. We + * accomplish this by simply assuming we'll do 2 x num_items + * worth of delayed refs updates in this trans handle, and + * refill that amount for whatever is missing in the reserve. + */ num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + if (delayed_refs_rsv->full == 0) { + delayed_refs_bytes = num_bytes; + num_bytes <<= 1; + } + /* * Do the reservation for the relocation root creation */ @@ -500,8 +513,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - num_bytes, flush); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); + if (ret) + goto reserve_fail; + if (delayed_refs_bytes) { + btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, + delayed_refs_bytes); + num_bytes -= delayed_refs_bytes; + } + } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && + !delayed_refs_rsv->full) { + /* + * Some people call with btrfs_start_transaction(root, 0) + * because they can be throttled, but have some other mechanism + * for reserving space. We still want these guys to refill the + * delayed block_rsv so just add 1 items worth of reservation + * here. + */ + ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); if (ret) goto reserve_fail; } @@ -670,7 +699,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) /* * btrfs_attach_transaction_barrier() - catch the running transaction * - * It is similar to the above function, the differentia is this one + * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully * complete. 
*/ @@ -760,7 +789,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans)) + if (btrfs_check_space_for_delayed_refs(fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -769,22 +798,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - int updates; - int err; smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED || cur_trans->delayed_refs.flushing) return 1; - updates = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (updates) { - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) /* Error code will also eval true */ - return err; - } - return should_end_transaction(trans); } @@ -814,11 +833,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - u64 transid = trans->transid; - unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; - int must_run_delayed_refs = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); @@ -832,27 +848,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans); - trans->delayed_ref_updates = 0; - if (!trans->sync) { - must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans); - cur = max_t(unsigned long, cur, 32); - - /* - * don't make the caller wait if they are from a NOLOCK - * or ATTACH transaction, it will deadlock with commit - */ - if (must_run_delayed_refs == 1 && - (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) - must_run_delayed_refs = 2; - } - - btrfs_trans_release_metadata(trans); - trans->block_rsv = NULL; - - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); - btrfs_trans_release_chunk_metadata(trans); if (lock && should_end_transaction(trans) && @@ -894,10 +889,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(info, cur, transid, - must_run_delayed_refs == 1); - } return err; } @@ -1338,7 +1329,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, return 0; /* - * Ensure dirty @src will be commited. Or, after comming + * Ensure dirty @src will be committed. Or, after coming * commit_fs_roots() and switch_commit_roots(), any dirty but not * recorded root will never be updated again, causing an outdated root * item. 
@@ -1842,7 +1833,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - DEFINE_WAIT(wait); WARN_ON(refcount_read(&trans->use_count) > 1); @@ -1911,13 +1901,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) -{ - wait_event(cur_trans->pending_wait, - atomic_read(&cur_trans->pending_ordered) == 0); -} - int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2052,8 +2035,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(fs_info); - btrfs_wait_pending_ordered(cur_trans); - btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 4cbb1b55387d..f1ba78949d1b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -12,13 +12,13 @@ #include "ctree.h" enum btrfs_trans_state { - TRANS_STATE_RUNNING = 0, - TRANS_STATE_BLOCKED = 1, - TRANS_STATE_COMMIT_START = 2, - TRANS_STATE_COMMIT_DOING = 3, - TRANS_STATE_UNBLOCKED = 4, - TRANS_STATE_COMPLETED = 5, - TRANS_STATE_MAX = 6, + TRANS_STATE_RUNNING, + TRANS_STATE_BLOCKED, + TRANS_STATE_COMMIT_START, + TRANS_STATE_COMMIT_DOING, + TRANS_STATE_UNBLOCKED, + TRANS_STATE_COMPLETED, + TRANS_STATE_MAX, }; #define BTRFS_TRANS_HAVE_FREE_BGS 0 @@ -39,7 +39,6 @@ struct btrfs_transaction { */ atomic_t num_writers; refcount_t use_count; - atomic_t pending_ordered; unsigned long flags; @@ -51,7 +50,6 @@ struct btrfs_transaction { time64_t start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; - wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; struct list_head switch_commits; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index efcf89a8ba44..a62e1e837a89 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -27,10 +27,10 @@ * * @type: leaf or node * @identifier: the necessary info to locate the leaf/node. - * It's recommened to decode key.objecitd/offset if it's + * It's recommended to decode key.objecitd/offset if it's * meaningful. * @reason: describe the error - * @bad_value: optional, it's recommened to output bad value and its + * @bad_value: optional, it's recommended to output bad value and its * expected value (range). * * Since comma is used to separate the components, only space is allowed @@ -130,7 +130,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, } /* - * Support for new compression/encrption must introduce incompat flag, + * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). */ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { @@ -389,13 +389,11 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, /* * Here we don't really care about alignment since extent allocator can - * handle it. We care more about the size, as if one block group is - * larger than maximum size, it's must be some obvious corruption. + * handle it. We care more about the size. 
*/ - if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) { + if (key->offset == 0) { block_group_err(fs_info, leaf, slot, - "invalid block group size, have %llu expect (0, %llu]", - key->offset, BTRFS_MAX_DATA_CHUNK_SIZE); + "invalid block group size 0"); return -EUCLEAN; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a5ce99a6c936..ac232b3d6d7e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1144,7 +1144,7 @@ next: } btrfs_release_path(path); - /* look for a conflicing name */ + /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); if (di && !IS_ERR(di)) { @@ -3149,7 +3149,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); /* - * nobody else is going to jump in and write the the ctree + * Nobody else is going to jump in and write the ctree * super here because the log_commit atomic below is protecting * us. We must be called with a transaction handle pinning * the running transaction open, so a full commit can't hop @@ -3201,8 +3201,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { int ret; - u64 start; - u64 end; struct walk_control wc = { .free = 1, .process_func = process_one_buffer @@ -3216,18 +3214,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, btrfs_handle_fs_error(log->fs_info, ret, NULL); } - while (1) { - ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, - NULL); - if (ret) - break; - - clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); - } - + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); free_extent_buffer(log->node); kfree(log); } @@ -4383,7 +4371,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4392,8 +4379,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; - logged_start = start; - logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { /* @@ -4434,11 +4419,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em->start >= i_size_read(&inode->vfs_inode)) continue; - if (em->start < logged_start) - logged_start = em->start; - if ((em->start + em->len - 1) > logged_end) - logged_end = em->start + em->len - 1; - /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -5778,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + /* + * If a new hard link was added to the inode in the current transaction + * and its link count is now greater than 1, we need to fallback to a + * transaction commit, otherwise we can end up not logging all its new + * parents for all the hard links. Here just from the dentry used to + * fsync, we can not visit the ancestor inodes for all the other hard + * links to figure out if any is new, so we fallback to a transaction + * commit (instead of adding a lot of complexity of scanning a btree, + * since this scenario is not a common use case). 
+ */ + if (inode->vfs_inode.i_nlink > 1 && + inode->last_link_trans > last_committed) { + ret = -EMLINK; + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 767765031e59..0fab84a8f670 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -15,7 +15,6 @@ struct btrfs_log_ctx { int log_ret; int log_transid; - int io_err; bool log_new_dentries; struct inode *inode; struct list_head list; @@ -26,7 +25,6 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, { ctx->log_ret = 0; ctx->log_transid = 0; - ctx->io_err = 0; ctx->log_new_dentries = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..2576b1a379c9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid10", .bg_flag = BTRFS_BLOCK_GROUP_RAID10, .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, @@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid1", .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, @@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, + .nparity = 0, .raid_name = "dup", .bg_flag = BTRFS_BLOCK_GROUP_DUP, .mindev_error = 0, @@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "raid0", .bg_flag = BTRFS_BLOCK_GROUP_RAID0, .mindev_error = 0, @@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "single", .bg_flag = 0, .mindev_error = 0, @@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_min = 2, .tolerated_failures = 1, .devs_increment = 1, - .ncopies = 2, + .ncopies = 1, + .nparity = 1, .raid_name = "raid5", .bg_flag = BTRFS_BLOCK_GROUP_RAID5, .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, @@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_min = 3, .tolerated_failures = 2, .devs_increment = 1, - .ncopies = 3, + .ncopies = 1, + .nparity = 2, .raid_name = "raid6", .bg_flag = BTRFS_BLOCK_GROUP_RAID6, .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, @@ -123,6 +130,60 @@ const char *get_raid_name(enum btrfs_raid_types type) return btrfs_raid_array[type].raid_name; } +/* + * Fill @buf with textual description of @bg_flags, no more than @size_buf + * bytes including terminating null byte. 
+ */ +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(flag, desc) \ + do { \ + if (flags & (flag)) { \ + ret = snprintf(bp, size_bp, "%s|", (desc)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + flags &= ~(flag); \ + } \ + } while (0) + + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, + btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + + if (flags) { + ret = snprintf(bp, size_bp, "0x%llx|", flags); + size_bp -= ret; + } + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ + + /* + * The text is trimmed, it's up to the caller to provide sufficiently + * large buffer + */ +out_overflow:; +} + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); @@ -151,7 +212,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * the mutex can be very coarse and can cover long-running operations * * protects: updates to fs_devices counters like missing devices, rw devices, - * seeding, structure cloning, openning/closing devices at mount/umount time + * seeding, structure cloning, opening/closing devices at mount/umount time * * global::fs_devs - add, remove, updates to the global list * @@ -238,13 +299,15 @@ struct list_head *btrfs_get_fs_uuids(void) /* * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the uuid to fs_devices::fsid + * @fsid: if not NULL, copy the UUID to fs_devices::fsid + * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, + const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devs; @@ -261,6 +324,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + if (metadata_fsid) + memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); + else if (fsid) + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); + return fs_devs; } @@ -368,13 +436,57 @@ static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, return NULL; } -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +static noinline struct btrfs_fs_devices *find_fsid( + const u8 *fsid, const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devices; + ASSERT(fsid); + + if (metadata_fsid) { + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by first scanning + * a device which didn't have its fsid/metadata_uuid changed + * at all and the CHANGING_FSID_V2 flag set. 
+ */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(metadata_fsid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by a device that + * has an outdated pair of fsid/metadata_uuid and + * CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(fs_devices->metadata_uuid, + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && + memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + } + + /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; + if (metadata_fsid) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 + && memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } else { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } } return NULL; } @@ -709,6 +821,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->generation = btrfs_super_generation(disk_super); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + if (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { + pr_err( + "BTRFS: Invalid seeding and uuid-changed device detected\n"); + goto error_brelse; + } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = 1; } else { @@ -744,6 +863,51 @@ error_brelse: } /* + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices + * being created with a disk that has already completed its fsid change. + */ +static struct btrfs_fs_devices *find_fsid_inprogress( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->fsid, + BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { + return fs_devices; + } + } + + return NULL; +} + + +static struct btrfs_fs_devices *find_fsid_changed( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handles the case where scanned device is part of an fs that had + * multiple successful changes of FSID but currently device didn't + * observe it. Meaning our fsid will be different than theirs.
+ */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE) != 0) { + return fs_devices; + } + } + + return NULL; +} +/* * Add new device to list of registered devices * * Returns: @@ -755,14 +919,46 @@ static noinline struct btrfs_device *device_list_add(const char *path, bool *new_device_added) { struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices; + struct btrfs_fs_devices *fs_devices = NULL; struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + if (fsid_change_in_progress) { + if (!has_metadata_uuid) { + /* + * When we have an image which has CHANGING_FSID_V2 set + * it might belong to either a filesystem which has + * disks with completed fsid change or it might belong + * to fs with no UUID changes in effect, handle both. + */ + fs_devices = find_fsid_inprogress(disk_super); + if (!fs_devices) + fs_devices = find_fsid(disk_super->fsid, NULL); + } else { + fs_devices = find_fsid_changed(disk_super); + } + } else if (has_metadata_uuid) { + fs_devices = find_fsid(disk_super->fsid, + disk_super->metadata_uuid); + } else { + fs_devices = find_fsid(disk_super->fsid, NULL); + } + - fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid); + if (has_metadata_uuid) + fs_devices = alloc_fs_devices(disk_super->fsid, + disk_super->metadata_uuid); + else + fs_devices = alloc_fs_devices(disk_super->fsid, NULL); + + fs_devices->fsid_change = fsid_change_in_progress; + if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); @@ -774,6 +970,21 @@ static noinline struct btrfs_device *device_list_add(const char *path, mutex_lock(&fs_devices->device_list_mutex); device = find_device(fs_devices, devid, disk_super->dev_item.uuid); + + /* + * If this disk has been pulled into an fs devices created by + * a device which had the CHANGING_FSID_V2 flag then replace the + * metadata_uuid/fsid values of the fs_devices. 
+ */ + if (has_metadata_uuid && fs_devices->fsid_change && + found_transid > fs_devices->latest_generation) { + memcpy(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + fs_devices->fsid_change = false; + } } if (!device) { @@ -850,6 +1061,35 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EEXIST); } + /* + * We are going to replace the device path for a given devid, + * make sure it's the same device if the device is mounted + */ + if (device->bdev) { + struct block_device *path_bdev; + + path_bdev = lookup_bdev(path); + if (IS_ERR(path_bdev)) { + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_CAST(path_bdev); + } + + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_warn_in_rcu(device->fs_info, + "duplicate device fsid:devid for %pU:%llu old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + return ERR_PTR(-EEXIST); + } + bdput(path_bdev); + btrfs_info_in_rcu(device->fs_info, + "device fsid %pU devid %llu moved old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + } + name = rcu_string_strdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); @@ -869,8 +1109,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, * it back. We need it to pick the disk with largest generation * (as above). */ - if (!fs_devices->opened) + if (!fs_devices->opened) { device->generation = found_transid; + fs_devices->latest_generation = max_t(u64, found_transid, + fs_devices->latest_generation); + } fs_devices->total_devices = btrfs_super_num_devices(disk_super); @@ -884,7 +1127,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *device; struct btrfs_device *orig_dev; - fs_devices = alloc_fs_devices(orig->fsid); + fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; @@ -1193,7 +1436,7 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, p = kmap(*page); /* align our pointer to the offset of the super block */ - *disk_super = p + (bytenr & ~PAGE_MASK); + *disk_super = p + offset_in_page(bytenr); if (btrfs_super_bytenr(*disk_super) != bytenr || btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { @@ -1709,7 +1952,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); + write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, + ptr, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -1862,12 +2106,12 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) { u64 num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { ASSERT(num_devices > 1); num_devices--; } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return num_devices; } @@ -1900,6 +2144,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, goto out; } + if (btrfs_pinned_by_swapfile(fs_info, device)) { + btrfs_warn_in_rcu(fs_info, + "cannot remove device %s (devid %llu) due to active 
swapfile", + rcu_str_deref(device->name), device->devid); + ret = -ETXTBSY; + goto out; + } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto out; @@ -2132,7 +2384,13 @@ static struct btrfs_device *btrfs_find_device_by_path( disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + device = btrfs_find_device(fs_info, devid, dev_uuid, + disk_super->metadata_uuid); + else + device = btrfs_find_device(fs_info, devid, + dev_uuid, disk_super->fsid); + brelse(bh); if (!device) device = ERR_PTR(-ENOENT); @@ -2202,7 +2460,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; - seed_devices = alloc_fs_devices(NULL); + seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); @@ -2238,7 +2496,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); - memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); mutex_unlock(&fs_devices->device_list_mutex); @@ -2480,7 +2738,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path * so rename the fsid on the sysfs */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - fs_info->fsid); + fs_info->fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_info, "sysfs: failed to create fsid for sprout"); @@ -2718,8 +2976,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) return ret; } -static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +/* + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * @logical: Logical block offset in bytes. + * @length: Length of extent in bytes. + * + * Return: Chunk mapping or ERR_PTR. 
+ */ +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -2756,7 +3021,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the @@ -2797,13 +3062,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) mutex_unlock(&fs_info->chunk_mutex); } - if (map->stripes[i].dev) { - ret = btrfs_update_device(trans, map->stripes[i].dev); - if (ret) { - mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = btrfs_update_device(trans, device); + if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; } } mutex_unlock(&fs_devices->device_list_mutex); @@ -3437,17 +3700,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_root *dev_root = fs_info->dev_root; - struct list_head *devices; - struct btrfs_device *device; - u64 old_size; - u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_trans_handle *trans; struct extent_buffer *leaf; int slot; int ret; @@ -3462,53 +3719,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_sys = 0; int chunk_reserved = 0; - /* step one make some room on all the devices */ - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - old_size = btrfs_device_get_total_bytes(device); - size_to_free = div_factor(old_size, 1); - size_to_free = min_t(u64, size_to_free, SZ_1M); - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || - btrfs_device_get_total_bytes(device) - - btrfs_device_get_bytes_used(device) > size_to_free || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) - continue; - - ret = btrfs_shrink_device(device, old_size - size_to_free); - if (ret == -ENOSPC) - break; - if (ret) { - /* btrfs_shrink_device never returns ret > 0 */ - WARN_ON(ret > 0); - goto error; - } - - trans = btrfs_start_transaction(dev_root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_info_in_rcu(fs_info, - "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - ret = btrfs_grow_device(trans, device, old_size); - if (ret) { - btrfs_end_transaction(trans); - /* btrfs_grow_device never returns ret > 0 */ - WARN_ON(ret > 0); - btrfs_info_in_rcu(fs_info, - "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - btrfs_end_transaction(trans); - } - - /* step two, relocate all the chunks */ path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3638,10 +3848,15 @@ again: ret = btrfs_relocate_chunk(fs_info, found_key.offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto error; if (ret == -ENOSPC) { enospc_errors++; + } else if (ret == 
-ETXTBSY) { + btrfs_info(fs_info, + "skipping relocation of block group %llu due to active swapfile", + found_key.offset); + ret = 0; + } else if (ret) { + goto error; } else { spin_lock(&fs_info->balance_lock); bctl->stat.completed++; @@ -3712,6 +3927,162 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, } /* + * Fill @buf with textual description of balance filter flags @bargs, up to + * @size_buf including the terminating null. The output may be trimmed if it + * does not fit into the provided buffer. + */ +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, + u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_APPEND_NOARG(a) \ + do { \ + ret = snprintf(bp, size_bp, (a)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_2ARG(a, v1, v2) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_CONVERT) { + int index = btrfs_bg_flags_to_raid_index(bargs->target); + + CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); + } + + if (flags & BTRFS_BALANCE_ARGS_SOFT) + CHECK_APPEND_NOARG("soft,"); + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) + CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) + CHECK_APPEND_2ARG("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + + if (flags & BTRFS_BALANCE_ARGS_DEVID) + CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) + CHECK_APPEND_2ARG("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + + if (flags & BTRFS_BALANCE_ARGS_VRANGE) + CHECK_APPEND_2ARG("vrange=%llu..%llu,", + bargs->vstart, bargs->vend); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT) + CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) + CHECK_APPEND_2ARG("limit=%u..%u,", + bargs->limit_min, bargs->limit_max); + + if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) + CHECK_APPEND_2ARG("stripes=%u..%u,", + bargs->stripes_min, bargs->stripes_max); + +#undef CHECK_APPEND_2ARG +#undef CHECK_APPEND_1ARG +#undef CHECK_APPEND_NOARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ + else + buf[0] = '\0'; +} + +static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) +{ + u32 size_buf = 1024; + char tmp_buf[192] = {'\0'}; + char *buf; + char *bp; + u32 size_bp = size_buf; + int ret; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + buf = kzalloc(size_buf, GFP_KERNEL); + if (!buf) + return; + + bp = buf; + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (bctl->flags & BTRFS_BALANCE_FORCE) + CHECK_APPEND_1ARG("%s", "-f "); + + if (bctl->flags & BTRFS_BALANCE_DATA) { + 
describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-d%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_METADATA) { + describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-m%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_SYSTEM) { + describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-s%s ", tmp_buf); + } + +#undef CHECK_APPEND_1ARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ + btrfs_info(fs_info, "balance: %s %s", + (bctl->flags & BTRFS_BALANCE_RESUME) ? + "resume" : "start", buf); + + kfree(buf); +} + +/* * Should be called with balance mutexe held */ int btrfs_balance(struct btrfs_fs_info *fs_info, @@ -3724,6 +4095,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; + bool reducing_integrity; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3803,24 +4175,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - btrfs_info(fs_info, - "balance: force reducing metadata integrity"); - } else { - btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); - ret = -EINVAL; - goto out; - } - } + !(bctl->meta.target & allowed))) + reducing_integrity = true; + else + reducing_integrity = false; + + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - /* if we're not converting, the target field is uninitialized */ - meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->meta.target : fs_info->avail_metadata_alloc_bits; - data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
- bctl->data.target : fs_info->avail_data_alloc_bits; + if (reducing_integrity) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, + "balance: force reducing metadata integrity"); + } else { + btrfs_err(fs_info, + "balance: reduces metadata integrity, use --force if you want this"); + ret = -EINVAL; + goto out; + } + } + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { int meta_index = btrfs_bg_flags_to_raid_index(meta_target); @@ -3850,11 +4228,19 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); + describe_balance_start_or_resume(fs_info); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + btrfs_info(fs_info, "balance: paused"); + else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { @@ -3887,10 +4273,8 @@ static int balance_kthread(void *data) int ret = 0; mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - btrfs_info(fs_info, "balance: resuming"); + if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); - } mutex_unlock(&fs_info->balance_mutex); return ret; @@ -4433,10 +4817,16 @@ again: ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto done; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { failed++; + } else if (ret) { + if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "could not shrink block group %llu due to active swapfile", + chunk_offset); + } + goto done; + } } while (key.offset-- > 0); if (failed && !retried) { @@ -4602,11 +4992,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int devs_min; /* min devs needed */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int nparity; /* number of stripes worth of bytes to + store parity information */ int ret; u64 max_stripe_size; u64 max_chunk_size; u64 stripe_size; - u64 num_bytes; + u64 chunk_size; int ndevs; int i; int j; @@ -4628,6 +5020,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devs_min = btrfs_raid_array[index].devs_min; devs_increment = btrfs_raid_array[index].devs_increment; ncopies = btrfs_raid_array[index].ncopies; + nparity = btrfs_raid_array[index].nparity; if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; @@ -4654,7 +5047,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, BUG_ON(1); } - /* we don't want a chunk larger than 10% of writeable space */ + /* We don't want a chunk larger than 10% of writable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); @@ -4757,30 +5150,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * this will have to be fixed for RAID1 and RAID10 over * more drives */ - data_stripes = num_stripes / ncopies; - - if (type & BTRFS_BLOCK_GROUP_RAID5) - data_stripes = num_stripes - 1; - - if (type & BTRFS_BLOCK_GROUP_RAID6) - data_stripes = num_stripes - 2; + data_stripes = (num_stripes - nparity) / ncopies; /* * Use the number of data stripes to figure out 
how big this chunk * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size + * and compare that answer with the max chunk size. If it's higher, + * we try to reduce stripe_size. */ if (stripe_size * data_stripes > max_chunk_size) { - stripe_size = div_u64(max_chunk_size, data_stripes); - - /* bump the answer up to a 16MB boundary */ - stripe_size = round_up(stripe_size, SZ_16M); - /* - * But don't go higher than the limits we found while searching - * for free extents + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. */ - stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size = min(round_up(div_u64(max_chunk_size, + data_stripes), SZ_16M), stripe_size); } @@ -4808,9 +5193,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->type = type; map->sub_stripes = sub_stripes; - num_bytes = stripe_size * data_stripes; + chunk_size = stripe_size * data_stripes; - trace_btrfs_chunk_alloc(info, map, start, num_bytes); + trace_btrfs_chunk_alloc(info, map, start, chunk_size); em = alloc_extent_map(); if (!em) { @@ -4821,7 +5206,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; em->start = start; - em->len = num_bytes; + em->len = chunk_size; em->block_start = 0; em->block_len = em->len; em->orig_block_len = stripe_size; @@ -4839,14 +5224,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, refcount_inc(&em->refs); write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); + ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); if (ret) goto error_del_extent; - for (i = 0; i < map->num_stripes; i++) { - num_bytes = map->stripes[i].dev->bytes_used + stripe_size; - btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); - } + for (i = 0; i < map->num_stripes; i++) + btrfs_device_set_bytes_used(map->stripes[i].dev, + map->stripes[i].dev->bytes_used + stripe_size); atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); @@ -4890,7 +5274,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em = get_chunk_map(fs_info, chunk_offset, chunk_size); + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); if (IS_ERR(em)) return PTR_ERR(em); @@ -4971,10 +5355,10 @@ out: } /* - * Chunk allocation falls into two parts. The first part does works - * that make the new allocated chunk useable, but not do any operation - * that modifies the chunk tree. The second part does the works that - * require modifying the chunk tree. This division is important for the + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the * bootstrap process of adding storage to a seed btrfs. 
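The reworked sizing logic in __btrfs_alloc_chunk above boils down to a little arithmetic: data_stripes = (num_stripes - nparity) / ncopies, and stripe_size is only reduced and re-rounded to 16 MiB when the resulting chunk would exceed max_chunk_size. A standalone sketch of just that arithmetic (userspace C with invented example values, not the kernel function itself):

#include <stdint.h>
#include <stdio.h>

#define SZ_16M (16ULL * 1024 * 1024)
#define SZ_1G  (1024ULL * 1024 * 1024)

/* Round x up to the next multiple of align. */
static uint64_t round_up_u64(uint64_t x, uint64_t align)
{
        return (x + align - 1) / align * align;
}

int main(void)
{
        /* Invented RAID6-like profile: 6 stripes, 2 of them parity, 1 copy. */
        int num_stripes = 6;
        int nparity = 2;
        int ncopies = 1;

        uint64_t max_chunk_size = 2 * SZ_1G;  /* pretend 10% of writable space */
        uint64_t stripe_size = SZ_1G;         /* from the free-extent search */

        int data_stripes = (num_stripes - nparity) / ncopies;

        /*
         * If the chunk would exceed max_chunk_size, shrink stripe_size and
         * round it back up to 16 MiB, but never let it grow past the value
         * found during the free-extent search.
         */
        if (stripe_size * data_stripes > max_chunk_size) {
                uint64_t reduced = round_up_u64(max_chunk_size / data_stripes,
                                                SZ_16M);
                if (reduced < stripe_size)
                        stripe_size = reduced;
        }

        printf("data_stripes=%d stripe_size=%llu chunk_size=%llu\n",
               data_stripes, (unsigned long long)stripe_size,
               (unsigned long long)(stripe_size * data_stripes));
        return 0;
}

With these numbers the 4 GiB candidate chunk is clamped to 2 GiB by shrinking stripe_size to 512 MiB, mirroring the behaviour of the hunk above.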
*/ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) @@ -5032,7 +5416,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) int miss_ndevs = 0; int i; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) return 1; @@ -5092,7 +5476,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) /* * We could return errors for these cases, but that could get @@ -5122,11 +5506,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = 1; free_extent_map(em); - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && fs_info->dev_replace.tgtdev) ret++; - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return ret; } @@ -5138,7 +5522,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5155,7 +5539,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5314,7 +5698,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, /* discard always return a bbio */ ASSERT(bbio_ret); - em = get_chunk_map(fs_info, logical, length); + em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5640,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, return __btrfs_map_block_for_discard(fs_info, logical, *length, bbio_ret); - em = get_chunk_map(fs_info, logical, *length); + em = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5699,17 +6083,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, *length = em->len - offset; } - /* This is for when we're called from btrfs_merge_bio_hook() and all - it cares about is the length */ + /* + * This is for when we're called from btrfs_bio_fits_in_stripe and all + * it cares about is the length + */ if (!bbio_ret) goto out; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + /* + * Hold the semaphore for read during the whole operation, write is + * requested at commit time but must wait. 
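The hunks above switch __btrfs_map_block from the old blocking-reader scheme to taking fs_info->dev_replace.rwsem for read and, when a replace is ongoing, keeping it held until the mapping is built. A minimal userspace analogue of that pattern, using pthread_rwlock_t and an invented replace_is_ongoing flag (a sketch of the locking shape only, not the btrfs code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t replace_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static bool replace_is_ongoing;  /* hypothetical stand-in for dev_replace state */

static void map_block(void)
{
        bool ongoing;

        pthread_rwlock_rdlock(&replace_rwsem);
        ongoing = replace_is_ongoing;
        /*
         * If no replace is running, drop the lock right away; otherwise keep
         * the read side held across the whole mapping so a writer (the
         * replace commit) has to wait until we are done.
         */
        if (!ongoing)
                pthread_rwlock_unlock(&replace_rwsem);

        /* ... build the stripe mapping here ... */

        if (ongoing)
                pthread_rwlock_unlock(&replace_rwsem);
}

int main(void)
{
        map_block();
        printf("mapping done\n");
        return 0;
}

Dropping the lock early in the common no-replace case keeps readers cheap, while an in-flight replace commit simply waits for outstanding mappings to finish.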
+ */ if (!dev_replace_is_ongoing) - btrfs_dev_replace_read_unlock(dev_replace); - else - btrfs_dev_replace_set_lock_blocking(dev_replace); + up_read(&dev_replace->rwsem); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !need_full_stripe(op) && dev_replace->tgtdev != NULL) { @@ -5904,12 +6292,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } out: if (dev_replace_is_ongoing) { - ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); - btrfs_dev_replace_read_lock(dev_replace); - /* Barrier implied by atomic_dec_and_test */ - if (atomic_dec_and_test(&dev_replace->blocking_readers)) - cond_wake_up_nomb(&dev_replace->read_lock_wq); - btrfs_dev_replace_read_unlock(dev_replace); + lockdep_assert_held(&dev_replace->rwsem); + /* Unlock and let waiting writers proceed */ + up_read(&dev_replace->rwsem); } free_extent_map(em); return ret; @@ -5943,7 +6328,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 rmap_len; int i, j, nr = 0; - em = get_chunk_map(fs_info, chunk_start, 1); + em = btrfs_get_chunk_map(fs_info, chunk_start, 1); if (IS_ERR(em)) return -EIO; @@ -6083,12 +6468,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, int should_queue = 1; struct btrfs_pending_bios *pending_bios; - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || - !device->bdev) { - bio_io_error(bio); - return; - } - /* don't bother with additional async steps for reads, right now */ if (bio_op(bio) == REQ_OP_READ) { btrfsic_submit_bio(bio); @@ -6217,7 +6596,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || + if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, + &dev->dev_state) || (bio_op(first_bio) == REQ_OP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); @@ -6245,7 +6625,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || - !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { + !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { device = find_device(cur_devices, devid, uuid); if (device) return device; @@ -6574,12 +6954,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = fs_devices->seed; } - fs_devices = find_fsid(fsid); + fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid); + fs_devices = alloc_fs_devices(fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; @@ -6629,7 +7009,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); if (IS_ERR(fs_devices)) return PTR_ERR(fs_devices); @@ -6876,7 +7256,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (missing > max_tolerated) { if (!failing_dev) btrfs_warn(fs_info, - "chunk %llu missing %d devices, max tolerance is %d for writeable mount", + "chunk %llu missing %d devices, max tolerance is %d for writable mount", em->start, missing, max_tolerated); free_extent_map(em); ret = false; @@ -7387,6 +7767,7 @@ static int 
verify_one_dev_extent(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; struct extent_map *em; struct map_lookup *map; + struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; @@ -7436,6 +7817,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, physical_offset, devid); ret = -EUCLEAN; } + + /* Make sure no dev extent is beyond device boundary */ + dev = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!dev) { + btrfs_err(fs_info, "failed to find devid %llu", devid); + ret = -EUCLEAN; + goto out; + } + if (physical_offset + physical_len > dev->disk_total_bytes) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", + devid, physical_offset, physical_len, + dev->disk_total_bytes); + ret = -EUCLEAN; + goto out; + } out: free_extent_map(em); return ret; @@ -7478,6 +7875,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; + u64 prev_devid = 0; + u64 prev_dev_ext_end = 0; int ret = 0; key.objectid = 1; @@ -7522,10 +7921,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext); + /* Check if this dev extent overlaps with the previous one */ + if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", + devid, physical_offset, prev_dev_ext_end); + ret = -EUCLEAN; + goto out; + } + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; + prev_devid = devid; + prev_dev_ext_end = physical_offset + physical_len; + ret = btrfs_next_item(root, path); if (ret < 0) goto out; @@ -7541,3 +7952,27 @@ out: btrfs_free_path(path); return ret; } + +/* + * Check whether the given block group or device is pinned by any inode being + * used as a swapfile. + */ +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) +{ + struct btrfs_swapfile_pin *sp; + struct rb_node *node; + + spin_lock(&fs_info->swapfile_pins_lock); + node = fs_info->swapfile_pins.rb_node; + while (node) { + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (ptr < sp->ptr) + node = node->rb_left; + else if (ptr > sp->ptr) + node = node->rb_right; + else + break; + } + spin_unlock(&fs_info->swapfile_pins_lock); + return node != NULL; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..ed806649a473 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -210,6 +210,8 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + bool fsid_change; struct list_head fs_list; u64 num_devices; @@ -218,6 +220,10 @@ struct btrfs_fs_devices { u64 missing_devices; u64 total_rw_bytes; u64 total_devices; + + /* Highest generation number of seen devices */ + u64 latest_generation; + struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex @@ -261,15 +267,12 @@ struct btrfs_fs_devices { * we allocate are actually btrfs_io_bios. We'll cram as much of * struct btrfs_bio as we can into this over time.
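The new checks in verify_one_dev_extent and btrfs_verify_dev_extents above rely on walking dev extents in (devid, physical offset) order while remembering where the previous extent ended. The same invariant can be demonstrated over a plain sorted array; in the self-contained sketch below the struct, sample values and device size are invented for illustration:

#include <stdint.h>
#include <stdio.h>

struct dev_extent {
        uint64_t devid;
        uint64_t offset;
        uint64_t len;
};

/* Entries must be sorted by (devid, offset), like the dev tree iteration. */
static int verify_dev_extents(const struct dev_extent *ext, int n,
                              uint64_t disk_total_bytes)
{
        uint64_t prev_devid = 0;
        uint64_t prev_end = 0;

        for (int i = 0; i < n; i++) {
                /* Overlap with the previous extent on the same device? */
                if (ext[i].devid == prev_devid && ext[i].offset < prev_end) {
                        fprintf(stderr,
                                "devid %llu offset %llu overlaps previous end %llu\n",
                                (unsigned long long)ext[i].devid,
                                (unsigned long long)ext[i].offset,
                                (unsigned long long)prev_end);
                        return -1;
                }
                /* Extent running past the end of the device? */
                if (ext[i].offset + ext[i].len > disk_total_bytes) {
                        fprintf(stderr,
                                "devid %llu extent ends beyond device boundary %llu\n",
                                (unsigned long long)ext[i].devid,
                                (unsigned long long)disk_total_bytes);
                        return -1;
                }
                prev_devid = ext[i].devid;
                prev_end = ext[i].offset + ext[i].len;
        }
        return 0;
}

int main(void)
{
        const struct dev_extent sample[] = {
                { 1, 1048576, 1073741824 },
                { 1, 1074790400, 1073741824 },
        };
        return verify_dev_extents(sample, 2, 4ULL * 1024 * 1024 * 1024) ? 1 : 0;
}

An extent starting exactly at the previous end is allowed; only a start strictly below it counts as an overlap, matching the < comparison in the hunk.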
*/ -typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { unsigned int mirror_num; unsigned int stripe_index; u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - u8 *csum_allocated; - btrfs_io_bio_end_io_t *end_io; struct bvec_iter iter; /* * This member must come last, bio_alloc_bioset will allocate enough @@ -283,15 +286,20 @@ static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) return container_of(bio, struct btrfs_io_bio, bio); } +static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio) +{ + if (io_bio->csum != io_bio->csum_inline) { + kfree(io_bio->csum); + io_bio->csum = NULL; + } +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; u64 length; /* only used for discard mappings */ }; -struct btrfs_bio; -typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); - struct btrfs_bio { refcount_t refs; atomic_t stripes_pending; @@ -331,6 +339,8 @@ struct btrfs_raid_attr { int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int nparity; /* number of stripes worth of bytes to store + * parity information */ int mindev_error; /* error code if min devs requisite is unmet */ const char raid_name[8]; /* name of the raid */ u64 bg_flag; /* block group flag of the raid */ @@ -430,6 +440,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); +void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); @@ -462,6 +473,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ea78c3d6dcfc..f141b45ce349 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -11,6 +11,7 @@ #include <linux/security.h> #include <linux/posix_acl_xattr.h> #include <linux/iversion.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode, { const struct xattr *xattr; struct btrfs_trans_handle *trans = fs_info; + unsigned int nofs_flag; char *name; int err = 0; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); @@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode, if (err < 0) break; } + memalloc_nofs_restore(nofs_flag); return err; } |
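One small helper added in volumes.h above, btrfs_io_bio_free_csum, frees the checksum buffer only when it does not point at the inline array embedded in the bio wrapper. The idiom is easy to show outside the kernel; the sketch below substitutes malloc/free for kmalloc/kfree and uses an invented csum_buf type:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define INLINE_CSUM_SIZE 64

struct csum_buf {
        uint8_t *csum;                       /* points at inline_csum[] or a heap buffer */
        uint8_t  inline_csum[INLINE_CSUM_SIZE];
};

static int csum_buf_init(struct csum_buf *cb, size_t nbytes)
{
        if (nbytes <= INLINE_CSUM_SIZE) {
                cb->csum = cb->inline_csum;
        } else {
                cb->csum = malloc(nbytes);
                if (!cb->csum)
                        return -1;
        }
        memset(cb->csum, 0, nbytes);
        return 0;
}

/* Mirrors the new helper: only free when the buffer was heap allocated. */
static void csum_buf_free(struct csum_buf *cb)
{
        if (cb->csum != cb->inline_csum) {
                free(cb->csum);
                cb->csum = NULL;
        }
}

int main(void)
{
        struct csum_buf small, large;

        csum_buf_init(&small, 32);    /* uses the inline array, nothing to free */
        csum_buf_free(&small);

        csum_buf_init(&large, 4096);  /* heap allocated, freed by the helper */
        csum_buf_free(&large);
        return 0;
}

Keeping small checksum arrays in the inline buffer avoids an allocation per bio in the common case, and the aliasing test makes the free path safe for both cases.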