Diffstat (limited to 'fs/btrfs')
47 files changed, 3025 insertions, 2197 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 68ebe188446a..78556447e1d5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node)  }  /* - * We maintain three seperate rbtrees: one for direct refs, one for + * We maintain three separate rbtrees: one for direct refs, one for   * indirect refs which have a key, and one for indirect refs which do not   * have a key. Each tree does merge on insertion.   * @@ -695,7 +695,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,  		}  		/* -		 * Now it's a direct ref, put it in the the direct tree. We must +		 * Now it's a direct ref, put it in the direct tree. We must  		 * do this last because the ref could be merged/freed here.  		 */  		prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); @@ -2020,9 +2020,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,  			ret = -ENOMEM;  			break;  		} -		extent_buffer_get(eb); -		btrfs_tree_read_lock(eb); -		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);  		btrfs_release_path(path);  		item = btrfs_item_nr(slot); @@ -2042,7 +2039,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,  			len = sizeof(*iref) + name_len;  			iref = (struct btrfs_inode_ref *)((char *)iref + len);  		} -		btrfs_tree_read_unlock_blocking(eb);  		free_extent_buffer(eb);  	} @@ -2083,10 +2079,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,  			ret = -ENOMEM;  			break;  		} -		extent_buffer_get(eb); - -		btrfs_tree_read_lock(eb); -		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);  		btrfs_release_path(path);  		item_size = btrfs_item_size_nr(eb, slot); @@ -2107,7 +2099,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,  			cur_offset += btrfs_inode_extref_name_len(eb, extref);  			cur_offset += sizeof(*extref);  		} -		btrfs_tree_read_unlock_blocking(eb);  		free_extent_buffer(eb);  		offset++; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 97d91e55b70a..6f5d07415dab 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -20,7 +20,7 @@   * new data the application may have written before commit.   */  enum { -	BTRFS_INODE_ORDERED_DATA_CLOSE = 0, +	BTRFS_INODE_ORDERED_DATA_CLOSE,  	BTRFS_INODE_DUMMY,  	BTRFS_INODE_IN_DEFRAG,  	BTRFS_INODE_HAS_ASYNC_EXTENT, @@ -29,6 +29,7 @@ enum {  	BTRFS_INODE_IN_DELALLOC_LIST,  	BTRFS_INODE_READDIO_NEED_LOCK,  	BTRFS_INODE_HAS_PROPS, +	BTRFS_INODE_SNAPSHOT_FLUSH,  };  /* in memory btrfs inode */ @@ -147,6 +148,12 @@ struct btrfs_inode {  	u64 last_unlink_trans;  	/* +	 * Track the transaction id of the last transaction used to create a +	 * hard link for the inode. This is used by the log tree (fsync). +	 */ +	u64 last_link_trans; + +	/*  	 * Number of bytes outstanding that are going to need csums.  This is  	 * used in ENOSPC accounting.  	 
*/ @@ -253,6 +260,11 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)  	return false;  } +static inline bool is_data_inode(struct inode *inode) +{ +	return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; +} +  static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,  						 int mod)  { diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 2e43fba44035..b0c8094528d1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1202,24 +1202,24 @@ static void btrfsic_read_from_block_data(  	void *dstv, u32 offset, size_t len)  {  	size_t cur; -	size_t offset_in_page; +	size_t pgoff;  	char *kaddr;  	char *dst = (char *)dstv; -	size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(block_ctx->start);  	unsigned long i = (start_offset + offset) >> PAGE_SHIFT;  	WARN_ON(offset + len > block_ctx->len); -	offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1); +	pgoff = offset_in_page(start_offset + offset);  	while (len > 0) { -		cur = min(len, ((size_t)PAGE_SIZE - offset_in_page)); +		cur = min(len, ((size_t)PAGE_SIZE - pgoff));  		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));  		kaddr = block_ctx->datav[i]; -		memcpy(dst, kaddr + offset_in_page, cur); +		memcpy(dst, kaddr + pgoff, cur);  		dst += cur;  		len -= cur; -		offset_in_page = 0; +		pgoff = 0;  		i++;  	}  } @@ -1601,7 +1601,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,  	BUG_ON(block_ctx->datav);  	BUG_ON(block_ctx->pagev);  	BUG_ON(block_ctx->mem_to_free); -	if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) { +	if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {  		pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",  		       block_ctx->dev_bytenr);  		return -1; @@ -1720,7 +1720,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,  	num_pages = state->metablock_size >> PAGE_SHIFT;  	h = (struct btrfs_header *)datav[0]; -	if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE)) +	if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))  		return 1;  	for (i = 0; i < num_pages; i++) { @@ -1778,7 +1778,7 @@ again:  				return;  			}  			is_metadata = 1; -			BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1)); +			BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));  			processed_len = BTRFS_SUPER_INFO_SIZE;  			if (state->print_mask &  			    BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { @@ -2327,7 +2327,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,  		 * write operations. Therefore it keeps the linkage  		 * information for a block until a block is  		 * rewritten. This can temporarily cause incorrect -		 * and even circular linkage informations. This +		 * and even circular linkage information. This  		 * causes no harm unless such blocks are referenced  		 * by the most recent super block.  		 
*/ @@ -2892,12 +2892,12 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,  	struct list_head *dev_head = &fs_devices->devices;  	struct btrfs_device *device; -	if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) { +	if (!PAGE_ALIGNED(fs_info->nodesize)) {  		pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",  		       fs_info->nodesize, PAGE_SIZE);  		return -1;  	} -	if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) { +	if (!PAGE_ALIGNED(fs_info->sectorsize)) {  		pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",  		       fs_info->sectorsize, PAGE_SIZE);  		return -1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2955a4ea2fa8..548057630b69 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -229,7 +229,6 @@ static noinline void end_compressed_writeback(struct inode *inode,   */  static void end_compressed_bio_write(struct bio *bio)  { -	struct extent_io_tree *tree;  	struct compressed_bio *cb = bio->bi_private;  	struct inode *inode;  	struct page *page; @@ -248,14 +247,10 @@ static void end_compressed_bio_write(struct bio *bio)  	 * call back into the FS and do all the end_io operations  	 */  	inode = cb->inode; -	tree = &BTRFS_I(inode)->io_tree;  	cb->compressed_pages[0]->mapping = cb->inode->i_mapping; -	tree->ops->writepage_end_io_hook(cb->compressed_pages[0], -					 cb->start, -					 cb->start + cb->len - 1, -					 NULL, -					 bio->bi_status ? -					 BLK_STS_OK : BLK_STS_NOTSUPP); +	btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], +			cb->start, cb->start + cb->len - 1, +			bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP);  	cb->compressed_pages[0]->mapping = NULL;  	end_compressed_writeback(inode, cb); @@ -306,7 +301,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,  	blk_status_t ret;  	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; -	WARN_ON(start & ((u64)PAGE_SIZE - 1)); +	WARN_ON(!PAGE_ALIGNED(start));  	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);  	if (!cb)  		return BLK_STS_RESOURCE; @@ -337,7 +332,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,  		page = compressed_pages[pg_index];  		page->mapping = inode->i_mapping;  		if (bio->bi_iter.bi_size) -			submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); +			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, +							  0);  		page->mapping = NULL;  		if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < @@ -481,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,  		if (page->index == end_index) {  			char *userpage; -			size_t zero_offset = isize & (PAGE_SIZE - 1); +			size_t zero_offset = offset_in_page(isize);  			if (zero_offset) {  				int zeros; @@ -615,8 +611,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  		page->index = em_start >> PAGE_SHIFT;  		if (comp_bio->bi_iter.bi_size) -			submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, -					comp_bio, 0); +			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, +							  comp_bio, 0);  		page->mapping = NULL;  		if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < @@ -1207,7 +1203,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,  /*   * Shannon Entropy calculation   * - * Pure byte distribution analysis fails to determine compressiability of data. + * Pure byte distribution analysis fails to determine compressibility of data.   
* Try calculating entropy to estimate the average minimum number of bits   * needed to encode the sampled data.   * @@ -1271,7 +1267,7 @@ static u8 get4bits(u64 num, int shift) {  /*   * Use 4 bits as radix base - * Use 16 u32 counters for calculating new possition in buf array + * Use 16 u32 counters for calculating new position in buf array   *   * @array     - array that will be sorted   * @array_buf - buffer array to store sorting results diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 539901fb5165..d92462fe66c8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -12,6 +12,7 @@  #include "transaction.h"  #include "print-tree.h"  #include "locking.h" +#include "volumes.h"  static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root  		      *root, struct btrfs_path *path, int level); @@ -224,7 +225,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	else  		btrfs_set_header_owner(cow, new_root_objectid); -	write_extent_buffer_fsid(cow, fs_info->fsid); +	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);  	WARN_ON(btrfs_header_generation(buf) > trans->transid);  	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) @@ -1050,7 +1051,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	else  		btrfs_set_header_owner(cow, root->root_key.objectid); -	write_extent_buffer_fsid(cow, fs_info->fsid); +	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);  	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);  	if (ret) { @@ -1290,7 +1291,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,  	btrfs_tree_read_unlock_blocking(eb);  	free_extent_buffer(eb); -	extent_buffer_get(eb_rewin);  	btrfs_tree_read_lock(eb_rewin);  	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);  	WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1362,7 +1362,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)  	if (!eb)  		return NULL; -	extent_buffer_get(eb);  	btrfs_tree_read_lock(eb);  	if (old_root) {  		btrfs_set_header_bytenr(eb, eb->start); @@ -1415,7 +1414,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,  	 *  	 * What is forced COW:  	 *    when we create snapshot during committing the transaction, -	 *    after we've finished coping src root, we must COW the shared +	 *    after we've finished copying src root, we must COW the shared  	 *    block to ensure the metadata consistency.  	 
*/  	if (btrfs_header_generation(buf) == trans->transid && @@ -1441,6 +1440,10 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,  	u64 search_start;  	int ret; +	if (test_bit(BTRFS_ROOT_DELETING, &root->state)) +		btrfs_err(fs_info, +			"COW'ing blocks on a fs root that's being dropped"); +  	if (trans->transaction != fs_info->running_transaction)  		WARN(1, KERN_CRIT "trans %llu running %llu\n",  		       trans->transid, @@ -2584,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,  	root_lock = BTRFS_READ_LOCK;  	if (p->search_commit_root) { -		/* The commit roots are read only so we always do read locks */ -		if (p->need_commit_sem) +		/* +		 * The commit roots are read only so we always do read locks, +		 * and we always must hold the commit_root_sem when doing +		 * searches on them, the only exception is send where we don't +		 * want to block transaction commits for a long time, so +		 * we need to clone the commit root in order to avoid races +		 * with transaction commits that create a snapshot of one of +		 * the roots used by a send operation. +		 */ +		if (p->need_commit_sem) {  			down_read(&fs_info->commit_root_sem); -		b = root->commit_root; -		extent_buffer_get(b); -		level = btrfs_header_level(b); -		if (p->need_commit_sem) +			b = btrfs_clone_extent_buffer(root->commit_root);  			up_read(&fs_info->commit_root_sem); +			if (!b) +				return ERR_PTR(-ENOMEM); + +		} else { +			b = root->commit_root; +			extent_buffer_get(b); +		} +		level = btrfs_header_level(b);  		/*  		 * Ensure that all callers have set skip_locking when  		 * p->search_commit_root = 1. @@ -2717,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,  again:  	prev_cmp = -1;  	b = btrfs_search_slot_get_root(root, p, write_lock_level); +	if (IS_ERR(b)) { +		ret = PTR_ERR(b); +		goto done; +	}  	while (b) {  		level = btrfs_header_level(b); @@ -3751,7 +3771,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root  		/* Key greater than all keys in the leaf, right neighbor has  		 * enough room for it and we're not emptying our leaf to delete  		 * it, therefore use right neighbor to insert the new item and -		 * no need to touch/dirty our left leaft. */ +		 * no need to touch/dirty our left leaf. 
*/  		btrfs_tree_unlock(left);  		free_extent_buffer(left);  		path->nodes[0] = right; @@ -5390,7 +5410,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  		ret = -ENOMEM;  		goto out;  	} -	extent_buffer_get(left_path->nodes[left_level]);  	right_level = btrfs_header_level(right_root->commit_root);  	right_root_level = right_level; @@ -5401,7 +5420,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  		ret = -ENOMEM;  		goto out;  	} -	extent_buffer_get(right_path->nodes[right_level]);  	up_read(&fs_info->commit_root_sem);  	if (left_level == 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68f322f600a0..f031a447a047 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -109,13 +109,26 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)  }  /* - * File system states + * Runtime (in-memory) states of filesystem   */ -#define BTRFS_FS_STATE_ERROR		0 -#define BTRFS_FS_STATE_REMOUNTING	1 -#define BTRFS_FS_STATE_TRANS_ABORTED	2 -#define BTRFS_FS_STATE_DEV_REPLACING	3 -#define BTRFS_FS_STATE_DUMMY_FS_INFO	4 +enum { +	/* Global indicator of serious filesystem errors */ +	BTRFS_FS_STATE_ERROR, +	/* +	 * Filesystem is being remounted, allow to skip some operations, like +	 * defrag +	 */ +	BTRFS_FS_STATE_REMOUNTING, +	/* Track if a transaction abort has been reported on this filesystem */ +	BTRFS_FS_STATE_TRANS_ABORTED, +	/* +	 * Bio operations should be blocked on this filesystem because a source +	 * or target device is being destroyed as part of a device replace +	 */ +	BTRFS_FS_STATE_DEV_REPLACING, +	/* The btrfs_fs_info created for self-tests */ +	BTRFS_FS_STATE_DUMMY_FS_INFO, +};  #define BTRFS_BACKREF_REV_MAX		256  #define BTRFS_BACKREF_REV_SHIFT		56 @@ -195,9 +208,10 @@ struct btrfs_root_backup {   * it currently lacks any block count etc etc   */  struct btrfs_super_block { -	u8 csum[BTRFS_CSUM_SIZE];  	/* the first 4 fields must match struct btrfs_header */ -	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */ +	u8 csum[BTRFS_CSUM_SIZE]; +	/* FS specific UUID, visible to user */ +	u8 fsid[BTRFS_FSID_SIZE];  	__le64 bytenr; /* this block number */  	__le64 flags; @@ -234,8 +248,11 @@ struct btrfs_super_block {  	__le64 cache_generation;  	__le64 uuid_tree_generation; +	/* the UUID written into btree blocks */ +	u8 metadata_uuid[BTRFS_FSID_SIZE]; +  	/* future expansion */ -	__le64 reserved[30]; +	__le64 reserved[28];  	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];  	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];  } __attribute__ ((__packed__)); @@ -265,7 +282,8 @@ struct btrfs_super_block {  	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\  	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\  	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\ -	 BTRFS_FEATURE_INCOMPAT_NO_HOLES) +	 BTRFS_FEATURE_INCOMPAT_NO_HOLES	|	\ +	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID)  #define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\  	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -316,7 +334,7 @@ struct btrfs_node {   * The slots array records the index of the item or block pointer   * used while walking the tree.   
*/ -enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; +enum { READA_NONE, READA_BACK, READA_FORWARD };  struct btrfs_path {  	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];  	int slots[BTRFS_MAX_LEVEL]; @@ -360,9 +378,7 @@ struct btrfs_dev_replace {  	struct btrfs_device *tgtdev;  	struct mutex lock_finishing_cancel_unmount; -	rwlock_t lock; -	atomic_t blocking_readers; -	wait_queue_head_t read_lock_wq; +	struct rw_semaphore rwsem;  	struct btrfs_scrub_progress scrub_progress; @@ -443,13 +459,19 @@ struct btrfs_space_info {  	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];  }; -#define	BTRFS_BLOCK_RSV_GLOBAL		1 -#define	BTRFS_BLOCK_RSV_DELALLOC	2 -#define	BTRFS_BLOCK_RSV_TRANS		3 -#define	BTRFS_BLOCK_RSV_CHUNK		4 -#define	BTRFS_BLOCK_RSV_DELOPS		5 -#define	BTRFS_BLOCK_RSV_EMPTY		6 -#define	BTRFS_BLOCK_RSV_TEMP		7 +/* + * Types of block reserves + */ +enum { +	BTRFS_BLOCK_RSV_GLOBAL, +	BTRFS_BLOCK_RSV_DELALLOC, +	BTRFS_BLOCK_RSV_TRANS, +	BTRFS_BLOCK_RSV_CHUNK, +	BTRFS_BLOCK_RSV_DELOPS, +	BTRFS_BLOCK_RSV_DELREFS, +	BTRFS_BLOCK_RSV_EMPTY, +	BTRFS_BLOCK_RSV_TEMP, +};  struct btrfs_block_rsv {  	u64 size; @@ -509,18 +531,18 @@ struct btrfs_free_cluster {  };  enum btrfs_caching_type { -	BTRFS_CACHE_NO		= 0, -	BTRFS_CACHE_STARTED	= 1, -	BTRFS_CACHE_FAST	= 2, -	BTRFS_CACHE_FINISHED	= 3, -	BTRFS_CACHE_ERROR	= 4, +	BTRFS_CACHE_NO, +	BTRFS_CACHE_STARTED, +	BTRFS_CACHE_FAST, +	BTRFS_CACHE_FINISHED, +	BTRFS_CACHE_ERROR,  };  enum btrfs_disk_cache_state { -	BTRFS_DC_WRITTEN	= 0, -	BTRFS_DC_ERROR		= 1, -	BTRFS_DC_CLEAR		= 2, -	BTRFS_DC_SETUP		= 3, +	BTRFS_DC_WRITTEN, +	BTRFS_DC_ERROR, +	BTRFS_DC_CLEAR, +	BTRFS_DC_SETUP,  };  struct btrfs_caching_control { @@ -712,41 +734,61 @@ struct btrfs_fs_devices;  struct btrfs_balance_control;  struct btrfs_delayed_root; -#define BTRFS_FS_BARRIER			1 -#define BTRFS_FS_CLOSING_START			2 -#define BTRFS_FS_CLOSING_DONE			3 -#define BTRFS_FS_LOG_RECOVERING			4 -#define BTRFS_FS_OPEN				5 -#define BTRFS_FS_QUOTA_ENABLED			6 -#define BTRFS_FS_UPDATE_UUID_TREE_GEN		9 -#define BTRFS_FS_CREATING_FREE_SPACE_TREE	10 -#define BTRFS_FS_BTREE_ERR			11 -#define BTRFS_FS_LOG1_ERR			12 -#define BTRFS_FS_LOG2_ERR			13 -#define BTRFS_FS_QUOTA_OVERRIDE			14 -/* Used to record internally whether fs has been frozen */ -#define BTRFS_FS_FROZEN				15 - -/* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ -#define BTRFS_FS_EXCL_OP			16 -  /* - * To info transaction_kthread we need an immediate commit so it doesn't - * need to wait for commit_interval + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree.   */ -#define BTRFS_FS_NEED_ASYNC_COMMIT		17 +struct btrfs_swapfile_pin { +	struct rb_node node; +	void *ptr; +	struct inode *inode; +	/* +	 * If true, ptr points to a struct btrfs_block_group_cache. Otherwise, +	 * ptr points to a struct btrfs_device. +	 */ +	bool is_block_group; +}; -/* - * Indicate that balance has been set up from the ioctl and is in the main - * phase. The fs_info::balance_ctl is initialized. 
- */ -#define BTRFS_FS_BALANCE_RUNNING		18 +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + +enum { +	BTRFS_FS_BARRIER, +	BTRFS_FS_CLOSING_START, +	BTRFS_FS_CLOSING_DONE, +	BTRFS_FS_LOG_RECOVERING, +	BTRFS_FS_OPEN, +	BTRFS_FS_QUOTA_ENABLED, +	BTRFS_FS_UPDATE_UUID_TREE_GEN, +	BTRFS_FS_CREATING_FREE_SPACE_TREE, +	BTRFS_FS_BTREE_ERR, +	BTRFS_FS_LOG1_ERR, +	BTRFS_FS_LOG2_ERR, +	BTRFS_FS_QUOTA_OVERRIDE, +	/* Used to record internally whether fs has been frozen */ +	BTRFS_FS_FROZEN, +	/* +	 * Indicate that a whole-filesystem exclusive operation is running +	 * (device replace, resize, device add/delete, balance) +	 */ +	BTRFS_FS_EXCL_OP, +	/* +	 * To info transaction_kthread we need an immediate commit so it +	 * doesn't need to wait for commit_interval +	 */ +	BTRFS_FS_NEED_ASYNC_COMMIT, +	/* +	 * Indicate that balance has been set up from the ioctl and is in the +	 * main phase. The fs_info::balance_ctl is initialized. +	 */ +	BTRFS_FS_BALANCE_RUNNING, +};  struct btrfs_fs_info { -	u8 fsid[BTRFS_FSID_SIZE];  	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];  	unsigned long flags;  	struct btrfs_root *extent_root; @@ -790,6 +832,8 @@ struct btrfs_fs_info {  	struct btrfs_block_rsv chunk_block_rsv;  	/* block reservation for delayed operations */  	struct btrfs_block_rsv delayed_block_rsv; +	/* block reservation for delayed refs */ +	struct btrfs_block_rsv delayed_refs_rsv;  	struct btrfs_block_rsv empty_block_rsv; @@ -1114,6 +1158,10 @@ struct btrfs_fs_info {  	u32 sectorsize;  	u32 stripesize; +	/* Block groups and devices containing active swapfiles. */ +	spinlock_t swapfile_pins_lock; +	struct rb_root swapfile_pins; +  #ifdef CONFIG_BTRFS_FS_REF_VERIFY  	spinlock_t ref_verify_lock;  	struct rb_root block_tree; @@ -1133,22 +1181,24 @@ struct btrfs_subvolume_writers {  /*   * The state of btrfs root   */ -/* - * btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code.   But the - * race is very small, and only the first time the root - * is added to each transaction.  So IN_TRANS_SETUP - * is used to tell us when more checks are required - */ -#define BTRFS_ROOT_IN_TRANS_SETUP	0 -#define BTRFS_ROOT_REF_COWS		1 -#define BTRFS_ROOT_TRACK_DIRTY		2 -#define BTRFS_ROOT_IN_RADIX		3 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED	4 -#define BTRFS_ROOT_DEFRAG_RUNNING	5 -#define BTRFS_ROOT_FORCE_COW		6 -#define BTRFS_ROOT_MULTI_LOG_TASKS	7 -#define BTRFS_ROOT_DIRTY		8 +enum { +	/* +	 * btrfs_record_root_in_trans is a multi-step process, and it can race +	 * with the balancing code.   But the race is very small, and only the +	 * first time the root is added to each transaction.  So IN_TRANS_SETUP +	 * is used to tell us when more checks are required +	 */ +	BTRFS_ROOT_IN_TRANS_SETUP, +	BTRFS_ROOT_REF_COWS, +	BTRFS_ROOT_TRACK_DIRTY, +	BTRFS_ROOT_IN_RADIX, +	BTRFS_ROOT_ORPHAN_ITEM_INSERTED, +	BTRFS_ROOT_DEFRAG_RUNNING, +	BTRFS_ROOT_FORCE_COW, +	BTRFS_ROOT_MULTI_LOG_TASKS, +	BTRFS_ROOT_DIRTY, +	BTRFS_ROOT_DELETING, +};  /*   * in ram representation of the tree.  
extent_root is used for all allocations @@ -1274,6 +1324,9 @@ struct btrfs_root {  	u64 qgroup_meta_rsv_pertrans;  	u64 qgroup_meta_rsv_prealloc; +	/* Number of active swapfiles */ +	atomic_t nr_swapfiles; +  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  	u64 alloc_bytenr;  #endif @@ -2570,10 +2623,10 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)  /* extent-tree.c */  enum btrfs_inline_ref_type { -	BTRFS_REF_TYPE_INVALID =	 0, -	BTRFS_REF_TYPE_BLOCK =		 1, -	BTRFS_REF_TYPE_DATA =		 2, -	BTRFS_REF_TYPE_ANY =		 3, +	BTRFS_REF_TYPE_INVALID, +	BTRFS_REF_TYPE_BLOCK, +	BTRFS_REF_TYPE_DATA, +	BTRFS_REF_TYPE_ANY,  };  int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2599,7 +2652,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,  }  int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);  void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,  					 const u64 start);  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2713,10 +2766,12 @@ enum btrfs_reserve_flush_enum {  enum btrfs_flush_state {  	FLUSH_DELAYED_ITEMS_NR	=	1,  	FLUSH_DELAYED_ITEMS	=	2, -	FLUSH_DELALLOC		=	3, -	FLUSH_DELALLOC_WAIT	=	4, -	ALLOC_CHUNK		=	5, -	COMMIT_TRANS		=	6, +	FLUSH_DELAYED_REFS_NR	=	3, +	FLUSH_DELAYED_REFS	=	4, +	FLUSH_DELALLOC		=	5, +	FLUSH_DELALLOC_WAIT	=	6, +	ALLOC_CHUNK		=	7, +	COMMIT_TRANS		=	8,  };  int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); @@ -2767,6 +2822,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,  void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,  			     struct btrfs_block_rsv *block_rsv,  			     u64 num_bytes); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, +				  enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, +				       struct btrfs_block_rsv *src, +				       u64 num_bytes);  int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);  void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);  void btrfs_put_block_group_cache(struct btrfs_fs_info *info); @@ -3141,7 +3203,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  			       struct inode *inode, u64 new_size,  			       u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root);  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,  			      unsigned int extra_bits, @@ -3150,9 +3212,16 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,  			     struct btrfs_root *new_root,  			     struct btrfs_root *parent_root,  			     u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, -			 size_t size, struct bio *bio, -			 unsigned long bio_flags); + void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, +			       unsigned *bits); +void btrfs_clear_delalloc_extent(struct inode *inode, +				 struct extent_state *state, unsigned *bits); +void btrfs_merge_delalloc_extent(struct inode *inode, struct 
extent_state *new, +				 struct extent_state *other); +void btrfs_split_delalloc_extent(struct inode *inode, +				 struct extent_state *orig, u64 split); +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, +			     unsigned long bio_flags);  void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);  int btrfs_readpage(struct file *file, struct page *page); @@ -3189,6 +3258,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,  				    struct btrfs_trans_handle *trans, int mode,  				    u64 start, u64 num_bytes, u64 min_size,  				    loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, +		u64 start, u64 end, int *page_started, unsigned long *nr_written, +		struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +					  u64 end, int uptodate);  extern const struct dentry_operations btrfs_dentry_operations;  /* ioctl.c */ @@ -3428,6 +3503,16 @@ static inline void assfail(const char *expr, const char *file, int line)  #define ASSERT(expr)	((void)0)  #endif +/* + * Use that for functions that are conditionally exported for sanity tests but + * otherwise static + */ +#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define EXPORT_FOR_TESTS static +#else +#define EXPORT_FOR_TESTS +#endif +  __cold  static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info)  { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9301b3ad9217..cad36c99a483 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -251,8 +251,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,  	ref->in_tree = 0;  	btrfs_put_delayed_ref(ref);  	atomic_dec(&delayed_refs->num_entries); -	if (trans->delayed_ref_updates) -		trans->delayed_ref_updates--;  }  static bool merge_ref(struct btrfs_trans_handle *trans, @@ -400,6 +398,20 @@ again:  	return head;  } +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +			   struct btrfs_delayed_ref_head *head) +{ +	lockdep_assert_held(&delayed_refs->lock); +	lockdep_assert_held(&head->lock); + +	rb_erase_cached(&head->href_node, &delayed_refs->href_root); +	RB_CLEAR_NODE(&head->href_node); +	atomic_dec(&delayed_refs->num_entries); +	delayed_refs->num_heads--; +	if (head->processing == 0) +		delayed_refs->num_heads_ready--; +} +  /*   * Helper to insert the ref_node to the tail or merge with tail.   
* @@ -453,7 +465,6 @@ inserted:  	if (ref->action == BTRFS_ADD_DELAYED_REF)  		list_add_tail(&ref->add_list, &href->ref_add_list);  	atomic_inc(&root->num_entries); -	trans->delayed_ref_updates++;  	spin_unlock(&href->lock);  	return ret;  } @@ -462,12 +473,14 @@ inserted:   * helper function to update the accounting in the head ref   * existing and update must have the same bytenr   */ -static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, +static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,  			 struct btrfs_delayed_ref_head *existing,  			 struct btrfs_delayed_ref_head *update,  			 int *old_ref_mod_ret)  { +	struct btrfs_delayed_ref_root *delayed_refs = +		&trans->transaction->delayed_refs; +	struct btrfs_fs_info *fs_info = trans->fs_info;  	int old_ref_mod;  	BUG_ON(existing->is_data != update->is_data); @@ -525,10 +538,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,  	 * versa we need to make sure to adjust pending_csums accordingly.  	 */  	if (existing->is_data) { -		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) +		u64 csum_leaves = +			btrfs_csum_bytes_to_leaves(fs_info, +						   existing->num_bytes); + +		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {  			delayed_refs->pending_csums -= existing->num_bytes; -		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) +			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); +		} +		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {  			delayed_refs->pending_csums += existing->num_bytes; +			trans->delayed_ref_updates += csum_leaves; +		}  	}  	spin_unlock(&existing->lock);  } @@ -634,7 +655,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,  			&& head_ref->qgroup_reserved  			&& existing->qgroup_ref_root  			&& existing->qgroup_reserved); -		update_existing_head_ref(delayed_refs, existing, head_ref, +		update_existing_head_ref(trans, existing, head_ref,  					 old_ref_mod);  		/*  		 * we've updated the existing ref, free the newly @@ -645,8 +666,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,  	} else {  		if (old_ref_mod)  			*old_ref_mod = 0; -		if (head_ref->is_data && head_ref->ref_mod < 0) +		if (head_ref->is_data && head_ref->ref_mod < 0) {  			delayed_refs->pending_csums += head_ref->num_bytes; +			trans->delayed_ref_updates += +				btrfs_csum_bytes_to_leaves(trans->fs_info, +							   head_ref->num_bytes); +		}  		delayed_refs->num_heads++;  		delayed_refs->num_heads_ready++;  		atomic_inc(&delayed_refs->num_entries); @@ -782,6 +807,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,  	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);  	spin_unlock(&delayed_refs->lock); +	/* +	 * Need to update the delayed_refs_rsv with any changes we may have +	 * made. +	 */ +	btrfs_update_delayed_refs_rsv(trans); +  	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,  				   action == BTRFS_ADD_DELAYED_EXTENT ?  				   BTRFS_ADD_DELAYED_REF : action); @@ -863,6 +894,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,  	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);  	spin_unlock(&delayed_refs->lock); +	/* +	 * Need to update the delayed_refs_rsv with any changes we may have +	 * made. +	 */ +	btrfs_update_delayed_refs_rsv(trans); +  	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,  				   action == BTRFS_ADD_DELAYED_EXTENT ?  				   
BTRFS_ADD_DELAYED_REF : action); @@ -899,6 +936,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,  			     NULL, NULL, NULL);  	spin_unlock(&delayed_refs->lock); + +	/* +	 * Need to update the delayed_refs_rsv with any changes we may have +	 * made. +	 */ +	btrfs_update_delayed_refs_rsv(trans);  	return 0;  } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 8e20c5cb5404..d2af974f68a1 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -261,7 +261,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)  {  	mutex_unlock(&head->mutex);  } - +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +			   struct btrfs_delayed_ref_head *head);  struct btrfs_delayed_ref_head *btrfs_select_ref_head(  		struct btrfs_delayed_ref_root *delayed_refs); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2aa48aecc52b..8750c835f535 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -59,7 +59,6 @@ no_valid_dev_replace_entry_found:  			BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;  		dev_replace->cont_reading_from_srcdev_mode =  		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; -		dev_replace->replace_state = 0;  		dev_replace->time_started = 0;  		dev_replace->time_stopped = 0;  		atomic64_set(&dev_replace->num_write_errors, 0); @@ -285,13 +284,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,  	struct btrfs_dev_replace_item *ptr;  	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; -	btrfs_dev_replace_read_lock(dev_replace); +	down_read(&dev_replace->rwsem);  	if (!dev_replace->is_valid ||  	    !dev_replace->item_needs_writeback) { -		btrfs_dev_replace_read_unlock(dev_replace); +		up_read(&dev_replace->rwsem);  		return 0;  	} -	btrfs_dev_replace_read_unlock(dev_replace); +	up_read(&dev_replace->rwsem);  	key.objectid = 0;  	key.type = BTRFS_DEV_REPLACE_KEY; @@ -349,7 +348,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,  	ptr = btrfs_item_ptr(eb, path->slots[0],  			     struct btrfs_dev_replace_item); -	btrfs_dev_replace_write_lock(dev_replace); +	down_write(&dev_replace->rwsem);  	if (dev_replace->srcdev)  		btrfs_set_dev_replace_src_devid(eb, ptr,  			dev_replace->srcdev->devid); @@ -372,7 +371,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,  	btrfs_set_dev_replace_cursor_right(eb, ptr,  		dev_replace->cursor_right);  	dev_replace->item_needs_writeback = 0; -	btrfs_dev_replace_write_unlock(dev_replace); +	up_write(&dev_replace->rwsem);  	btrfs_mark_buffer_dirty(eb); @@ -390,7 +389,7 @@ static char* btrfs_dev_name(struct btrfs_device *device)  		return rcu_str_deref(device->name);  } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,  		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,  		int read_src)  { @@ -407,6 +406,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,  	if (IS_ERR(src_device))  		return PTR_ERR(src_device); +	if (btrfs_pinned_by_swapfile(fs_info, src_device)) { +		btrfs_warn_in_rcu(fs_info, +	  "cannot replace device %s (devid %llu) due to active swapfile", +			btrfs_dev_name(src_device), src_device->devid); +		return -ETXTBSY; +	} +  	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,  					    src_device, &tgt_device);  	if (ret) @@ -426,7 +432,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,  	}  	need_unlock = true; -	
btrfs_dev_replace_write_lock(dev_replace); +	down_write(&dev_replace->rwsem);  	switch (dev_replace->replace_state) {  	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:  	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -464,7 +470,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,  	dev_replace->item_needs_writeback = 1;  	atomic64_set(&dev_replace->num_write_errors, 0);  	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); -	btrfs_dev_replace_write_unlock(dev_replace); +	up_write(&dev_replace->rwsem);  	need_unlock = false;  	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); @@ -478,7 +484,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,  	if (IS_ERR(trans)) {  		ret = PTR_ERR(trans);  		need_unlock = true; -		btrfs_dev_replace_write_lock(dev_replace); +		down_write(&dev_replace->rwsem);  		dev_replace->replace_state =  			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;  		dev_replace->srcdev = NULL; @@ -497,7 +503,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,  	ret = btrfs_dev_replace_finishing(fs_info, ret);  	if (ret == -EINPROGRESS) {  		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; -	} else { +	} else if (ret != -ECANCELED) {  		WARN_ON(ret);  	} @@ -505,7 +511,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,  leave:  	if (need_unlock) -		btrfs_dev_replace_write_unlock(dev_replace); +		up_write(&dev_replace->rwsem);  	btrfs_destroy_dev_replace_tgtdev(tgt_device);  	return ret;  } @@ -533,8 +539,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,  					args->start.cont_reading_from_srcdev_mode);  	args->result = ret;  	/* don't warn if EINPROGRESS, someone else might be running scrub */ -	if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) -		ret = 0; +	if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || +	    ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) +		return 0;  	return ret;  } @@ -572,18 +579,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,  	/* don't allow cancel or unmount to disturb the finishing procedure */  	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); -	btrfs_dev_replace_read_lock(dev_replace); +	down_read(&dev_replace->rwsem);  	/* was the operation canceled, or is it finished? */  	if (dev_replace->replace_state !=  	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { -		btrfs_dev_replace_read_unlock(dev_replace); +		up_read(&dev_replace->rwsem);  		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);  		return 0;  	}  	tgt_device = dev_replace->tgtdev;  	src_device = dev_replace->srcdev; -	btrfs_dev_replace_read_unlock(dev_replace); +	up_read(&dev_replace->rwsem);  	/*  	 * flush all outstanding I/O and inode extent mappings before the @@ -607,7 +614,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,  	/* keep away write_all_supers() during the finishing procedure */  	mutex_lock(&fs_info->fs_devices->device_list_mutex);  	mutex_lock(&fs_info->chunk_mutex); -	btrfs_dev_replace_write_lock(dev_replace); +	down_write(&dev_replace->rwsem);  	dev_replace->replace_state =  		scrub_ret ? 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED  			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -622,12 +629,13 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,  								src_device,  								tgt_device);  	} else { -		btrfs_err_in_rcu(fs_info, +		if (scrub_ret != -ECANCELED) +			btrfs_err_in_rcu(fs_info,  				 "btrfs_scrub_dev(%s, %llu, %s) failed %d",  				 btrfs_dev_name(src_device),  				 src_device->devid,  				 rcu_str_deref(tgt_device->name), scrub_ret); -		btrfs_dev_replace_write_unlock(dev_replace); +		up_write(&dev_replace->rwsem);  		mutex_unlock(&fs_info->chunk_mutex);  		mutex_unlock(&fs_info->fs_devices->device_list_mutex);  		btrfs_rm_dev_replace_blocked(fs_info); @@ -663,8 +671,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,  	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);  	fs_info->fs_devices->rw_devices++; -	btrfs_dev_replace_write_unlock(dev_replace); - +	up_write(&dev_replace->rwsem);  	btrfs_rm_dev_replace_blocked(fs_info);  	btrfs_rm_dev_replace_remove_srcdev(src_device); @@ -761,7 +768,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,  {  	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; -	btrfs_dev_replace_read_lock(dev_replace); +	down_read(&dev_replace->rwsem);  	/* even if !dev_replace_is_valid, the values are good enough for  	 * the replace_status ioctl */  	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -773,7 +780,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,  	args->status.num_uncorrectable_read_errors =  		atomic64_read(&dev_replace->num_uncorrectable_read_errors);  	args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); -	btrfs_dev_replace_read_unlock(dev_replace); +	up_read(&dev_replace->rwsem);  }  int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) @@ -790,46 +797,74 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)  		return -EROFS;  	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); -	btrfs_dev_replace_write_lock(dev_replace); +	down_write(&dev_replace->rwsem);  	switch (dev_replace->replace_state) {  	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:  	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:  	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:  		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; -		btrfs_dev_replace_write_unlock(dev_replace); -		goto leave; +		up_write(&dev_replace->rwsem); +		break;  	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +		tgt_device = dev_replace->tgtdev; +		src_device = dev_replace->srcdev; +		up_write(&dev_replace->rwsem); +		ret = btrfs_scrub_cancel(fs_info); +		if (ret < 0) { +			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; +		} else { +			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; +			/* +			 * btrfs_dev_replace_finishing() will handle the +			 * cleanup part +			 */ +			btrfs_info_in_rcu(fs_info, +				"dev_replace from %s (devid %llu) to %s canceled", +				btrfs_dev_name(src_device), src_device->devid, +				btrfs_dev_name(tgt_device)); +		} +		break;  	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		/* +		 * Scrub doing the replace isn't running so we need to do the +		 * cleanup step of btrfs_dev_replace_finishing() here +		 */  		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;  		tgt_device = dev_replace->tgtdev;  		src_device = dev_replace->srcdev;  		dev_replace->tgtdev = NULL;  		dev_replace->srcdev = NULL; -		break; -	} -	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; -	dev_replace->time_stopped = 
ktime_get_real_seconds(); -	dev_replace->item_needs_writeback = 1; -	btrfs_dev_replace_write_unlock(dev_replace); -	btrfs_scrub_cancel(fs_info); +		dev_replace->replace_state = +				BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; +		dev_replace->time_stopped = ktime_get_real_seconds(); +		dev_replace->item_needs_writeback = 1; -	trans = btrfs_start_transaction(root, 0); -	if (IS_ERR(trans)) { -		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); -		return PTR_ERR(trans); -	} -	ret = btrfs_commit_transaction(trans); -	WARN_ON(ret); +		up_write(&dev_replace->rwsem); -	btrfs_info_in_rcu(fs_info, -		"dev_replace from %s (devid %llu) to %s canceled", -		btrfs_dev_name(src_device), src_device->devid, -		btrfs_dev_name(tgt_device)); +		/* Scrub for replace must not be running in suspended state */ +		ret = btrfs_scrub_cancel(fs_info); +		ASSERT(ret != -ENOTCONN); + +		trans = btrfs_start_transaction(root, 0); +		if (IS_ERR(trans)) { +			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +			return PTR_ERR(trans); +		} +		ret = btrfs_commit_transaction(trans); +		WARN_ON(ret); -	if (tgt_device) -		btrfs_destroy_dev_replace_tgtdev(tgt_device); +		btrfs_info_in_rcu(fs_info, +		"suspended dev_replace from %s (devid %llu) to %s canceled", +			btrfs_dev_name(src_device), src_device->devid, +			btrfs_dev_name(tgt_device)); + +		if (tgt_device) +			btrfs_destroy_dev_replace_tgtdev(tgt_device); +		break; +	default: +		result = -EINVAL; +	} -leave:  	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);  	return result;  } @@ -839,7 +874,8 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)  	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;  	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); -	btrfs_dev_replace_write_lock(dev_replace); +	down_write(&dev_replace->rwsem); +  	switch (dev_replace->replace_state) {  	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:  	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -855,7 +891,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)  		break;  	} -	btrfs_dev_replace_write_unlock(dev_replace); +	up_write(&dev_replace->rwsem);  	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);  } @@ -865,12 +901,13 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)  	struct task_struct *task;  	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; -	btrfs_dev_replace_write_lock(dev_replace); +	down_write(&dev_replace->rwsem); +  	switch (dev_replace->replace_state) {  	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:  	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:  	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: -		btrfs_dev_replace_write_unlock(dev_replace); +		up_write(&dev_replace->rwsem);  		return 0;  	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:  		break; @@ -884,10 +921,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)  			   "cannot continue dev_replace, tgtdev is missing");  		btrfs_info(fs_info,  			   "you may cancel the operation after 'mount -o degraded'"); -		btrfs_dev_replace_write_unlock(dev_replace); +		dev_replace->replace_state = +					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; +		up_write(&dev_replace->rwsem);  		return 0;  	} -	btrfs_dev_replace_write_unlock(dev_replace); +	up_write(&dev_replace->rwsem);  	/*  	 * This could collide with a paused balance, but the exclusive op logic @@ -895,6 +934,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)  	 * dev-replace to start anyway.  	 
*/  	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { +		down_write(&dev_replace->rwsem); +		dev_replace->replace_state = +					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; +		up_write(&dev_replace->rwsem);  		btrfs_info(fs_info,  		"cannot resume dev-replace, other exclusive operation running");  		return 0; @@ -925,7 +968,7 @@ static int btrfs_dev_replace_kthread(void *data)  			      btrfs_device_get_total_bytes(dev_replace->srcdev),  			      &dev_replace->scrub_progress, 0, 1);  	ret = btrfs_dev_replace_finishing(fs_info, ret); -	WARN_ON(ret); +	WARN_ON(ret && ret != -ECANCELED);  	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);  	return 0; @@ -948,7 +991,7 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)  		 * something that can happen if the dev_replace  		 * procedure is suspended by an umount and then  		 * the tgtdev is missing (or "btrfs dev scan") was -		 * not called and the the filesystem is remounted +		 * not called and the filesystem is remounted  		 * in degraded state. This does not stop the  		 * dev_replace procedure. It needs to be canceled  		 * manually if the cancellation is wanted. @@ -958,42 +1001,6 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)  	return 1;  } -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) -{ -	read_lock(&dev_replace->lock); -} - -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) -{ -	read_unlock(&dev_replace->lock); -} - -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace) -{ -again: -	wait_event(dev_replace->read_lock_wq, -		   atomic_read(&dev_replace->blocking_readers) == 0); -	write_lock(&dev_replace->lock); -	if (atomic_read(&dev_replace->blocking_readers)) { -		write_unlock(&dev_replace->lock); -		goto again; -	} -} - -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace) -{ -	write_unlock(&dev_replace->lock); -} - -/* inc blocking cnt and release read lock */ -void btrfs_dev_replace_set_lock_blocking( -					struct btrfs_dev_replace *dev_replace) -{ -	/* only set blocking for read lock */ -	atomic_inc(&dev_replace->blocking_readers); -	read_unlock(&dev_replace->lock); -} -  void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)  {  	percpu_counter_inc(&fs_info->dev_replace.bio_counter); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 795c551f5b5e..4aa40bacc6cc 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -13,19 +13,11 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,  			  struct btrfs_fs_info *fs_info);  int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,  			    struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, -		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, -		int read_src);  void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,  			      struct btrfs_ioctl_dev_replace_args *args);  int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);  void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);  int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);  int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_unlock(struct 
btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);  #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d776717d8b3..8da2f380d3c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,  	len = buf->len - offset;  	while (len > 0) { +		/* +		 * Note: we don't need to check for the err == 1 case here, as +		 * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' +		 * and 'min_len = 32' and the currently implemented mapping +		 * algorithm we cannot cross a page boundary. +		 */  		err = map_private_extent_buffer(buf, offset, 32,  					&kaddr, &map_start, &map_len);  		if (err) @@ -542,7 +548,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)  	if (WARN_ON(!PageUptodate(page)))  		return -EUCLEAN; -	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid, +	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,  			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);  	return csum_tree_block(fs_info, eb, 0); @@ -557,7 +563,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,  	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);  	while (fs_devices) { -		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { +		u8 *metadata_uuid; + +		/* +		 * Checking the incompat flag is only valid for the current +		 * fs. For seed devices it's forbidden to have their uuid +		 * changed so reading ->fsid in this case is fine +		 */ +		if (fs_devices == fs_info->fs_devices && +		    btrfs_fs_incompat(fs_info, METADATA_UUID)) +			metadata_uuid = fs_devices->metadata_uuid; +		else +			metadata_uuid = fs_devices->fsid; + +		if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {  			ret = 0;  			break;  		} @@ -660,19 +679,6 @@ out:  	return ret;  } -static int btree_io_failed_hook(struct page *page, int failed_mirror) -{ -	struct extent_buffer *eb; - -	eb = (struct extent_buffer *)page->private; -	set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); -	eb->read_mirror = failed_mirror; -	atomic_dec(&eb->io_pages); -	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) -		btree_readahead_hook(eb, -EIO); -	return -EIO;	/* we fixed nothing */ -} -  static void end_workqueue_bio(struct bio *bio)  {  	struct btrfs_end_io_wq *end_io_wq = bio->bi_private; @@ -751,11 +757,22 @@ static void run_one_async_start(struct btrfs_work *work)  		async->status = ret;  } +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time.   All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. 
+ */  static void run_one_async_done(struct btrfs_work *work)  {  	struct async_submit_bio *async; +	struct inode *inode; +	blk_status_t ret;  	async = container_of(work, struct  async_submit_bio, work); +	inode = async->private_data;  	/* If an error occurred we just want to clean up the bio and move on */  	if (async->status) { @@ -764,7 +781,12 @@ static void run_one_async_done(struct btrfs_work *work)  		return;  	} -	btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num); +	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, +			async->mirror_num, 1); +	if (ret) { +		async->bio->bi_status = ret; +		bio_endio(async->bio); +	}  }  static void run_one_async_free(struct btrfs_work *work) @@ -1178,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,  	refcount_set(&root->refs, 1);  	atomic_set(&root->will_be_snapshotted, 0);  	atomic_set(&root->snapshot_force_cow, 0); +	atomic_set(&root->nr_swapfiles, 0);  	root->log_transid = 0;  	root->log_transid_committed = -1;  	root->last_log_commit = 0; @@ -2118,10 +2141,8 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)  static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)  {  	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); -	rwlock_init(&fs_info->dev_replace.lock); -	atomic_set(&fs_info->dev_replace.blocking_readers, 0); +	init_rwsem(&fs_info->dev_replace.rwsem);  	init_waitqueue_head(&fs_info->dev_replace.replace_wait); -	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);  }  static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) @@ -2442,10 +2463,11 @@ static int validate_super(struct btrfs_fs_info *fs_info,  		ret = -EINVAL;  	} -	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { +	if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, +		   BTRFS_FSID_SIZE) != 0) {  		btrfs_err(fs_info, -			   "dev_item UUID does not match fsid: %pU != %pU", -			   fs_info->fsid, sb->dev_item.fsid); +			"dev_item UUID does not match metadata fsid: %pU != %pU", +			fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);  		ret = -EINVAL;  	} @@ -2656,6 +2678,9 @@ int open_ctree(struct super_block *sb,  	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);  	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,  			     BTRFS_BLOCK_RSV_DELOPS); +	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, +			     BTRFS_BLOCK_RSV_DELREFS); +  	atomic_set(&fs_info->async_delalloc_pages, 0);  	atomic_set(&fs_info->defrag_running, 0);  	atomic_set(&fs_info->qgroup_op_seq, 0); @@ -2745,6 +2770,9 @@ int open_ctree(struct super_block *sb,  	fs_info->sectorsize = 4096;  	fs_info->stripesize = 4096; +	spin_lock_init(&fs_info->swapfile_pins_lock); +	fs_info->swapfile_pins = RB_ROOT; +  	ret = btrfs_alloc_stripe_hash_table(fs_info);  	if (ret) {  		err = ret; @@ -2781,11 +2809,29 @@ int open_ctree(struct super_block *sb,  	 * the whole block of INFO_SIZE  	 */  	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); -	memcpy(fs_info->super_for_commit, fs_info->super_copy, -	       sizeof(*fs_info->super_for_commit));  	brelse(bh); -	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); +	disk_super = fs_info->super_copy; + +	ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, +		       BTRFS_FSID_SIZE)); + +	if (btrfs_fs_incompat(fs_info, METADATA_UUID)) { +		ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid, +				fs_info->super_copy->metadata_uuid, +				
BTRFS_FSID_SIZE)); +	} + +	features = btrfs_super_flags(disk_super); +	if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { +		features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; +		btrfs_set_super_flags(disk_super, features); +		btrfs_info(fs_info, +			"found metadata UUID change in progress flag, clearing"); +	} + +	memcpy(fs_info->super_for_commit, fs_info->super_copy, +	       sizeof(*fs_info->super_for_commit));  	ret = btrfs_validate_mount_super(fs_info);  	if (ret) { @@ -2794,7 +2840,6 @@ int open_ctree(struct super_block *sb,  		goto fail_alloc;  	} -	disk_super = fs_info->super_copy;  	if (!btrfs_super_root(disk_super))  		goto fail_alloc; @@ -2906,7 +2951,7 @@ int open_ctree(struct super_block *sb,  	sb->s_blocksize = sectorsize;  	sb->s_blocksize_bits = blksize_bits(sectorsize); -	memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); +	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);  	mutex_lock(&fs_info->chunk_mutex);  	ret = btrfs_read_sys_array(fs_info); @@ -3055,7 +3100,7 @@ retry_root_backup:  	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {  		btrfs_warn(fs_info, -		"writeable mount is not allowed due to too many missing devices"); +		"writable mount is not allowed due to too many missing devices");  		goto fail_sysfs;  	} @@ -3724,7 +3769,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)  		btrfs_set_stack_device_io_width(dev_item, dev->io_width);  		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);  		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); -		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE); +		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, +		       BTRFS_FSID_SIZE);  		flags = btrfs_super_flags(sb);  		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); @@ -4031,7 +4077,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  	/*  	 * This is a fast path so only do this check if we have sanity tests -	 * enabled.  Normal people shouldn't be using umapped buffers as dirty +	 * enabled.  Normal people shouldn't be using unmapped buffers as dirty  	 * outside of the sanity tests.  	 */  	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) @@ -4329,6 +4375,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,  	unpin = pinned_extents;  again:  	while (1) { +		struct extent_state *cached_state = NULL; +  		/*  		 * The btrfs_finish_extent_commit() may get the same range as  		 * ours between find_first_extent_bit and clear_extent_dirty. 
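
[Editor's note — illustrative sketch, not part of the patch.] The check_tree_block_fsid() and validate_super() hunks above switch metadata comparisons from fs_devices->fsid to fs_devices->metadata_uuid whenever the METADATA_UUID incompat flag is set on the mounted filesystem, while seed devices keep being compared against their plain fsid. The standalone userspace C model below sketches only that selection rule; toy_fs_devices, expected_metadata_uuid and header_fsid_matches are invented stand-ins for illustration, not kernel symbols.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define FSID_SIZE 16

struct toy_fs_devices {
	unsigned char fsid[FSID_SIZE];
	unsigned char metadata_uuid[FSID_SIZE];
};

/*
 * Pick the UUID that tree block headers are expected to carry: the separate
 * metadata_uuid only for the currently mounted fs with the METADATA_UUID
 * incompat flag set, otherwise the plain fsid (seed devices always use fsid).
 */
static const unsigned char *expected_metadata_uuid(const struct toy_fs_devices *devs,
						   bool is_current_fs,
						   bool metadata_uuid_incompat)
{
	if (is_current_fs && metadata_uuid_incompat)
		return devs->metadata_uuid;
	return devs->fsid;
}

static bool header_fsid_matches(const unsigned char *header_fsid,
				const struct toy_fs_devices *devs,
				bool is_current_fs, bool metadata_uuid_incompat)
{
	return memcmp(header_fsid,
		      expected_metadata_uuid(devs, is_current_fs,
					     metadata_uuid_incompat),
		      FSID_SIZE) == 0;
}

int main(void)
{
	struct toy_fs_devices devs = {
		.fsid          = "0123456789abcdef",
		.metadata_uuid = "fedcba9876543210",
	};

	/* With METADATA_UUID set, headers of the mounted fs carry metadata_uuid. */
	printf("current fs, incompat set:  match=%d\n",
	       header_fsid_matches(devs.metadata_uuid, &devs, true, true));
	/* A seed device is always checked against its plain fsid. */
	printf("seed device, incompat set: match=%d\n",
	       header_fsid_matches(devs.fsid, &devs, false, true));
	return 0;
}

The underlying rationale, as the added comment in check_tree_block_fsid() states, is that only the currently mounted filesystem may legitimately carry a metadata UUID that differs from its fsid; for every other fs_devices in the seed chain the fsid itself is the expected value.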
@@ -4337,13 +4385,14 @@ again:  		 */  		mutex_lock(&fs_info->unused_bg_unpin_mutex);  		ret = find_first_extent_bit(unpin, 0, &start, &end, -					    EXTENT_DIRTY, NULL); +					    EXTENT_DIRTY, &cached_state);  		if (ret) {  			mutex_unlock(&fs_info->unused_bg_unpin_mutex);  			break;  		} -		clear_extent_dirty(unpin, start, end); +		clear_extent_dirty(unpin, start, end, &cached_state); +		free_extent_state(cached_state);  		btrfs_error_unpin_extent_range(fs_info, start, end);  		mutex_unlock(&fs_info->unused_bg_unpin_mutex);  		cond_resched(); @@ -4400,6 +4449,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,  		spin_unlock(&cur_trans->dirty_bgs_lock);  		btrfs_put_block_group(cache); +		btrfs_delayed_refs_rsv_release(fs_info, 1);  		spin_lock(&cur_trans->dirty_bgs_lock);  	}  	spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4505,7 +4555,4 @@ static const struct extent_io_ops btree_extent_io_ops = {  	/* mandatory callbacks */  	.submit_bio_hook = btree_submit_bio_hook,  	.readpage_end_io_hook = btree_readpage_end_io_hook, -	.readpage_io_failed_hook = btree_io_failed_hook, - -	/* optional callbacks */  }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4cccba22640f..987a64bc0c66 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -21,11 +21,11 @@  #define BTRFS_BDEV_BLOCKSIZE	(4096)  enum btrfs_wq_endio_type { -	BTRFS_WQ_ENDIO_DATA = 0, -	BTRFS_WQ_ENDIO_METADATA = 1, -	BTRFS_WQ_ENDIO_FREE_SPACE = 2, -	BTRFS_WQ_ENDIO_RAID56 = 3, -	BTRFS_WQ_ENDIO_DIO_REPAIR = 4, +	BTRFS_WQ_ENDIO_DATA, +	BTRFS_WQ_ENDIO_METADATA, +	BTRFS_WQ_ENDIO_FREE_SPACE, +	BTRFS_WQ_ENDIO_RAID56, +	BTRFS_WQ_ENDIO_DIO_REPAIR,  };  static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1febf155747..b15afeae16df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -51,6 +51,24 @@ enum {  	CHUNK_ALLOC_FORCE = 2,  }; +/* + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name)					\ +static inline void update_##name(struct btrfs_space_info *sinfo,	\ +				 s64 bytes)				\ +{									\ +	if (bytes < 0 && sinfo->name < -bytes) {			\ +		WARN_ON(1);						\ +		sinfo->name = 0;					\ +		return;							\ +	}								\ +	sinfo->name += bytes;						\ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); +  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			       struct btrfs_delayed_ref_node *node, u64 parent,  			       u64 root_objectid, u64 owner_objectid, @@ -1037,7 +1055,7 @@ out_free:  /*   * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requried, + * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,   * is_data == BTRFS_REF_TYPE_ANY, either type is OK.   
*/  int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2406,25 +2424,82 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref  	btrfs_delayed_ref_unlock(head);  } -static int cleanup_extent_op(struct btrfs_trans_handle *trans, -			     struct btrfs_delayed_ref_head *head) +static struct btrfs_delayed_extent_op *cleanup_extent_op( +				struct btrfs_delayed_ref_head *head)  {  	struct btrfs_delayed_extent_op *extent_op = head->extent_op; -	int ret;  	if (!extent_op) -		return 0; -	head->extent_op = NULL; +		return NULL; +  	if (head->must_insert_reserved) { +		head->extent_op = NULL;  		btrfs_free_delayed_extent_op(extent_op); -		return 0; +		return NULL;  	} +	return extent_op; +} + +static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, +				     struct btrfs_delayed_ref_head *head) +{ +	struct btrfs_delayed_extent_op *extent_op; +	int ret; + +	extent_op = cleanup_extent_op(head); +	if (!extent_op) +		return 0; +	head->extent_op = NULL;  	spin_unlock(&head->lock);  	ret = run_delayed_extent_op(trans, head, extent_op);  	btrfs_free_delayed_extent_op(extent_op);  	return ret ? ret : 1;  } +static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, +					struct btrfs_delayed_ref_head *head) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_delayed_ref_root *delayed_refs = +		&trans->transaction->delayed_refs; +	int nr_items = 1;	/* Dropping this ref head update. */ + +	if (head->total_ref_mod < 0) { +		struct btrfs_space_info *space_info; +		u64 flags; + +		if (head->is_data) +			flags = BTRFS_BLOCK_GROUP_DATA; +		else if (head->is_system) +			flags = BTRFS_BLOCK_GROUP_SYSTEM; +		else +			flags = BTRFS_BLOCK_GROUP_METADATA; +		space_info = __find_space_info(fs_info, flags); +		ASSERT(space_info); +		percpu_counter_add_batch(&space_info->total_bytes_pinned, +				   -head->num_bytes, +				   BTRFS_TOTAL_BYTES_PINNED_BATCH); + +		/* +		 * We had csum deletions accounted for in our delayed refs rsv, +		 * we need to drop the csum leaves for this update from our +		 * delayed_refs_rsv. 
+		 */ +		if (head->is_data) { +			spin_lock(&delayed_refs->lock); +			delayed_refs->pending_csums -= head->num_bytes; +			spin_unlock(&delayed_refs->lock); +			nr_items += btrfs_csum_bytes_to_leaves(fs_info, +				head->num_bytes); +		} +	} + +	/* Also free its reserved qgroup space */ +	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, +				      head->qgroup_reserved); +	btrfs_delayed_refs_rsv_release(fs_info, nr_items); +} +  static int cleanup_ref_head(struct btrfs_trans_handle *trans,  			    struct btrfs_delayed_ref_head *head)  { @@ -2435,7 +2510,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,  	delayed_refs = &trans->transaction->delayed_refs; -	ret = cleanup_extent_op(trans, head); +	ret = run_and_cleanup_extent_op(trans, head);  	if (ret < 0) {  		unselect_delayed_ref_head(delayed_refs, head);  		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); @@ -2456,37 +2531,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,  		spin_unlock(&delayed_refs->lock);  		return 1;  	} -	delayed_refs->num_heads--; -	rb_erase_cached(&head->href_node, &delayed_refs->href_root); -	RB_CLEAR_NODE(&head->href_node); +	btrfs_delete_ref_head(delayed_refs, head);  	spin_unlock(&head->lock);  	spin_unlock(&delayed_refs->lock); -	atomic_dec(&delayed_refs->num_entries); - -	trace_run_delayed_ref_head(fs_info, head, 0); - -	if (head->total_ref_mod < 0) { -		struct btrfs_space_info *space_info; -		u64 flags; - -		if (head->is_data) -			flags = BTRFS_BLOCK_GROUP_DATA; -		else if (head->is_system) -			flags = BTRFS_BLOCK_GROUP_SYSTEM; -		else -			flags = BTRFS_BLOCK_GROUP_METADATA; -		space_info = __find_space_info(fs_info, flags); -		ASSERT(space_info); -		percpu_counter_add_batch(&space_info->total_bytes_pinned, -				   -head->num_bytes, -				   BTRFS_TOTAL_BYTES_PINNED_BATCH); - -		if (head->is_data) { -			spin_lock(&delayed_refs->lock); -			delayed_refs->pending_csums -= head->num_bytes; -			spin_unlock(&delayed_refs->lock); -		} -	}  	if (head->must_insert_reserved) {  		btrfs_pin_extent(fs_info, head->bytenr, @@ -2497,9 +2544,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,  		}  	} -	/* Also free its reserved qgroup space */ -	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, -				      head->qgroup_reserved); +	cleanup_ref_head_accounting(trans, head); + +	trace_run_delayed_ref_head(fs_info, head, 0);  	btrfs_delayed_ref_unlock(head);  	btrfs_put_delayed_ref_head(head);  	return 0; @@ -2792,40 +2839,28 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)  	return num_csums;  } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)  { -	struct btrfs_fs_info *fs_info = trans->fs_info; -	struct btrfs_block_rsv *global_rsv; -	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; -	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; -	unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; -	u64 num_bytes, num_dirty_bgs_bytes; -	int ret = 0; +	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; +	bool ret = false; +	u64 reserved; -	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); -	num_heads = heads_to_leaves(fs_info, num_heads); -	if (num_heads > 1) -		num_bytes += (num_heads - 1) * fs_info->nodesize; -	num_bytes <<= 1; -	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) 
* -							fs_info->nodesize; -	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, -							     num_dirty_bgs); -	global_rsv = &fs_info->global_block_rsv; +	spin_lock(&global_rsv->lock); +	reserved = global_rsv->reserved; +	spin_unlock(&global_rsv->lock);  	/* -	 * If we can't allocate any more chunks lets make sure we have _lots_ of -	 * wiggle room since running delayed refs can create more delayed refs. +	 * Since the global reserve is just kind of magic we don't really want +	 * to rely on it to save our bacon, so if our size is more than the +	 * delayed_refs_rsv and the global rsv then it's time to think about +	 * bailing.  	 */ -	if (global_rsv->space_info->full) { -		num_dirty_bgs_bytes <<= 1; -		num_bytes <<= 1; -	} - -	spin_lock(&global_rsv->lock); -	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) -		ret = 1; -	spin_unlock(&global_rsv->lock); +	spin_lock(&delayed_refs_rsv->lock); +	reserved += delayed_refs_rsv->reserved; +	if (delayed_refs_rsv->size >= reserved) +		ret = true; +	spin_unlock(&delayed_refs_rsv->lock);  	return ret;  } @@ -2844,7 +2879,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)  	if (val >= NSEC_PER_SEC / 2)  		return 2; -	return btrfs_check_space_for_delayed_refs(trans); +	return btrfs_check_space_for_delayed_refs(trans->fs_info);  }  struct async_delayed_refs { @@ -3588,6 +3623,8 @@ again:  	 */  	mutex_lock(&trans->transaction->cache_write_mutex);  	while (!list_empty(&dirty)) { +		bool drop_reserve = true; +  		cache = list_first_entry(&dirty,  					 struct btrfs_block_group_cache,  					 dirty_list); @@ -3660,6 +3697,7 @@ again:  					list_add_tail(&cache->dirty_list,  						      &cur_trans->dirty_bgs);  					btrfs_get_block_group(cache); +					drop_reserve = false;  				}  				spin_unlock(&cur_trans->dirty_bgs_lock);  			} else if (ret) { @@ -3667,9 +3705,11 @@ again:  			}  		} -		/* if its not on the io list, we need to put the block group */ +		/* if it's not on the io list, we need to put the block group */  		if (should_put)  			btrfs_put_block_group(cache); +		if (drop_reserve) +			btrfs_delayed_refs_rsv_release(fs_info, 1);  		if (ret)  			break; @@ -3818,6 +3858,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,  		/* if its not on the io list, we need to put the block group */  		if (should_put)  			btrfs_put_block_group(cache); +		btrfs_delayed_refs_rsv_release(fs_info, 1);  		spin_lock(&cur_trans->dirty_bgs_lock);  	}  	spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4256,7 +4297,7 @@ commit_trans:  					      data_sinfo->flags, bytes, 1);  		return -ENOSPC;  	} -	data_sinfo->bytes_may_use += bytes; +	update_bytes_may_use(data_sinfo, bytes);  	trace_btrfs_space_reservation(fs_info, "space_info",  				      data_sinfo->flags, bytes, 1);  	spin_unlock(&data_sinfo->lock); @@ -4309,10 +4350,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,  	data_sinfo = fs_info->data_sinfo;  	spin_lock(&data_sinfo->lock); -	if (WARN_ON(data_sinfo->bytes_may_use < len)) -		data_sinfo->bytes_may_use = 0; -	else -		data_sinfo->bytes_may_use -= len; +	update_bytes_may_use(data_sinfo, -len);  	trace_btrfs_space_reservation(fs_info, "space_info",  				      data_sinfo->flags, len, 0);  	spin_unlock(&data_sinfo->lock); @@ -4637,7 +4675,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info,  	/*  	 * If we have dup, raid1 or raid10 then only half of the free -	 * space is actually useable.  For raid56, the space info used +	 * space is actually usable.  
For raid56, the space info used  	 * doesn't include the parity drive, so we don't have to  	 * change the math  	 */ @@ -4793,8 +4831,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,  {  	struct reserve_ticket *ticket = NULL;  	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; +	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;  	struct btrfs_trans_handle *trans; -	u64 bytes; +	u64 bytes_needed; +	u64 reclaim_bytes = 0;  	trans = (struct btrfs_trans_handle *)current->journal_info;  	if (trans) @@ -4807,15 +4847,15 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,  	else if (!list_empty(&space_info->tickets))  		ticket = list_first_entry(&space_info->tickets,  					  struct reserve_ticket, list); -	bytes = (ticket) ? ticket->bytes : 0; +	bytes_needed = (ticket) ? ticket->bytes : 0;  	spin_unlock(&space_info->lock); -	if (!bytes) +	if (!bytes_needed)  		return 0;  	/* See if there is enough pinned space to make this reservation */  	if (__percpu_counter_compare(&space_info->total_bytes_pinned, -				   bytes, +				   bytes_needed,  				   BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)  		goto commit; @@ -4827,14 +4867,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,  		return -ENOSPC;  	spin_lock(&delayed_rsv->lock); -	if (delayed_rsv->size > bytes) -		bytes = 0; -	else -		bytes -= delayed_rsv->size; +	reclaim_bytes += delayed_rsv->reserved;  	spin_unlock(&delayed_rsv->lock); +	spin_lock(&delayed_refs_rsv->lock); +	reclaim_bytes += delayed_refs_rsv->reserved; +	spin_unlock(&delayed_refs_rsv->lock); +	if (reclaim_bytes >= bytes_needed) +		goto commit; +	bytes_needed -= reclaim_bytes; +  	if (__percpu_counter_compare(&space_info->total_bytes_pinned, -				   bytes, +				   bytes_needed,  				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {  		return -ENOSPC;  	} @@ -4882,6 +4926,20 @@ static void flush_space(struct btrfs_fs_info *fs_info,  		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,  				state == FLUSH_DELALLOC_WAIT);  		break; +	case FLUSH_DELAYED_REFS_NR: +	case FLUSH_DELAYED_REFS: +		trans = btrfs_join_transaction(root); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); +			break; +		} +		if (state == FLUSH_DELAYED_REFS_NR) +			nr = calc_reclaim_items_nr(fs_info, num_bytes); +		else +			nr = 0; +		btrfs_run_delayed_refs(trans, nr); +		btrfs_end_transaction(trans); +		break;  	case ALLOC_CHUNK:  		trans = btrfs_join_transaction(root);  		if (IS_ERR(trans)) { @@ -5108,7 +5166,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,  		list_del_init(&ticket->list);  	if (ticket->bytes && ticket->bytes < orig_bytes) {  		u64 num_bytes = orig_bytes - ticket->bytes; -		space_info->bytes_may_use -= num_bytes; +		update_bytes_may_use(space_info, -num_bytes);  		trace_btrfs_space_reservation(fs_info, "space_info",  					      space_info->flags, num_bytes, 0);  	} @@ -5154,13 +5212,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,  	 * If not things get more complicated.  	 
*/  	if (used + orig_bytes <= space_info->total_bytes) { -		space_info->bytes_may_use += orig_bytes; +		update_bytes_may_use(space_info, orig_bytes);  		trace_btrfs_space_reservation(fs_info, "space_info",  					      space_info->flags, orig_bytes, 1);  		ret = 0;  	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,  				  system_chunk)) { -		space_info->bytes_may_use += orig_bytes; +		update_bytes_may_use(space_info, orig_bytes);  		trace_btrfs_space_reservation(fs_info, "space_info",  					      space_info->flags, orig_bytes, 1);  		ret = 0; @@ -5223,7 +5281,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,  	if (ticket.bytes) {  		if (ticket.bytes < orig_bytes) {  			u64 num_bytes = orig_bytes - ticket.bytes; -			space_info->bytes_may_use -= num_bytes; +			update_bytes_may_use(space_info, -num_bytes);  			trace_btrfs_space_reservation(fs_info, "space_info",  						      space_info->flags,  						      num_bytes, 0); @@ -5244,7 +5302,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,   * @orig_bytes - the number of bytes we want   * @flush - whether or not we can flush to make our reservation   * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated   * with the block_rsv.  If there is not enough space it will make an attempt to   * flush out space to make room.  It will do this by flushing delalloc if   * possible or committing the transaction.  If flush is 0 then no attempts to @@ -5354,6 +5412,90 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,  	return 0;  } +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv.  Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, +				       struct btrfs_block_rsv *src, +				       u64 num_bytes) +{ +	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; +	u64 to_free = 0; + +	spin_lock(&src->lock); +	src->reserved -= num_bytes; +	src->size -= num_bytes; +	spin_unlock(&src->lock); + +	spin_lock(&delayed_refs_rsv->lock); +	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { +		u64 delta = delayed_refs_rsv->size - +			delayed_refs_rsv->reserved; +		if (num_bytes > delta) { +			to_free = num_bytes - delta; +			num_bytes = delta; +		} +	} else { +		to_free = num_bytes; +		num_bytes = 0; +	} + +	if (num_bytes) +		delayed_refs_rsv->reserved += num_bytes; +	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) +		delayed_refs_rsv->full = 1; +	spin_unlock(&delayed_refs_rsv->lock); + +	if (num_bytes) +		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", +					      0, num_bytes, 1); +	if (to_free) +		space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, +					 to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. 
+ */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, +				  enum btrfs_reserve_flush_enum flush) +{ +	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; +	u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); +	u64 num_bytes = 0; +	int ret = -ENOSPC; + +	spin_lock(&block_rsv->lock); +	if (block_rsv->reserved < block_rsv->size) { +		num_bytes = block_rsv->size - block_rsv->reserved; +		num_bytes = min(num_bytes, limit); +	} +	spin_unlock(&block_rsv->lock); + +	if (!num_bytes) +		return 0; + +	ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, +				     num_bytes, flush); +	if (ret) +		return ret; +	block_rsv_add_bytes(block_rsv, num_bytes, 0); +	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", +				      0, num_bytes, 1); +	return 0; +} +  /*   * This is for space we already have accounted in space_info->bytes_may_use, so   * basically when we're returning space from block_rsv's. @@ -5407,7 +5549,7 @@ again:  		flush = BTRFS_RESERVE_FLUSH_ALL;  		goto again;  	} -	space_info->bytes_may_use -= num_bytes; +	update_bytes_may_use(space_info, -num_bytes);  	trace_btrfs_space_reservation(fs_info, "space_info",  				      space_info->flags, num_bytes, 0);  	spin_unlock(&space_info->lock); @@ -5435,7 +5577,7 @@ again:  						      ticket->bytes, 1);  			list_del_init(&ticket->list);  			num_bytes -= ticket->bytes; -			space_info->bytes_may_use += ticket->bytes; +			update_bytes_may_use(space_info, ticket->bytes);  			ticket->bytes = 0;  			space_info->tickets_id++;  			wake_up(&ticket->wait); @@ -5443,7 +5585,7 @@ again:  			trace_btrfs_space_reservation(fs_info, "space_info",  						      space_info->flags,  						      num_bytes, 1); -			space_info->bytes_may_use += num_bytes; +			update_bytes_may_use(space_info, num_bytes);  			ticket->bytes -= num_bytes;  			num_bytes = 0;  		} @@ -5629,11 +5771,11 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,  /**   * btrfs_inode_rsv_refill - refill the inode block rsv.   * @inode - the inode we are refilling. - * @flush - the flusing restriction. + * @flush - the flushing restriction.   *   * Essentially the same as btrfs_block_rsv_refill, except it uses the   * block_rsv->size as the minimum size.  We'll either refill the missing amount - * or return if we already have enough space.  This will also handle the resreve + * or return if we already have enough space.  This will also handle the reserve   * tracepoint for the reserved amount.   */  static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, @@ -5674,6 +5816,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,  	return ret;  } +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, +				     struct btrfs_block_rsv *block_rsv, +				     u64 num_bytes, u64 *qgroup_to_release) +{ +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; +	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; +	struct btrfs_block_rsv *target = delayed_rsv; + +	if (target->full || target == block_rsv) +		target = global_rsv; + +	if (block_rsv->space_info != target->space_info) +		target = NULL; + +	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, +				       qgroup_to_release); +} + +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, +			     struct btrfs_block_rsv *block_rsv, +			     u64 num_bytes) +{ +	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} +  /**   * btrfs_inode_rsv_release - release any excessive reservation.   
* @inode - the inode we need to release from. @@ -5688,7 +5855,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,  static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)  {  	struct btrfs_fs_info *fs_info = inode->root->fs_info; -	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;  	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;  	u64 released = 0;  	u64 qgroup_to_release = 0; @@ -5698,8 +5864,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)  	 * are releasing 0 bytes, and then we'll just get the reservation over  	 * the size free'd.  	 */ -	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, -					   &qgroup_to_release); +	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, +					     &qgroup_to_release);  	if (released > 0)  		trace_btrfs_space_reservation(fs_info, "delalloc",  					      btrfs_ino(inode), released, 0); @@ -5710,16 +5876,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)  						   qgroup_to_release);  } -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, -			     struct btrfs_block_rsv *block_rsv, -			     u64 num_bytes) +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. + */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)  { +	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; +	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); +	u64 released = 0; -	if (global_rsv == block_rsv || -	    block_rsv->space_info != global_rsv->space_info) -		global_rsv = NULL; -	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); +	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, +					   num_bytes, NULL); +	if (released) +		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", +					      0, released, 0);  }  static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5750,14 +5926,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)  			num_bytes = min(num_bytes,  					block_rsv->size - block_rsv->reserved);  			block_rsv->reserved += num_bytes; -			sinfo->bytes_may_use += num_bytes; +			update_bytes_may_use(sinfo, num_bytes);  			trace_btrfs_space_reservation(fs_info, "space_info",  						      sinfo->flags, num_bytes,  						      1);  		}  	} else if (block_rsv->reserved > block_rsv->size) {  		num_bytes = block_rsv->reserved - block_rsv->size; -		sinfo->bytes_may_use -= num_bytes; +		update_bytes_may_use(sinfo, -num_bytes);  		trace_btrfs_space_reservation(fs_info, "space_info",  				      sinfo->flags, num_bytes, 0);  		block_rsv->reserved = block_rsv->size; @@ -5784,9 +5960,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)  	fs_info->trans_block_rsv.space_info = space_info;  	fs_info->empty_block_rsv.space_info = space_info;  	fs_info->delayed_block_rsv.space_info = space_info; +	fs_info->delayed_refs_rsv.space_info = space_info; -	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; -	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; +	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; +	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;  	
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;  	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;  	if (fs_info->quota_root) @@ -5806,8 +5983,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)  	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);  	WARN_ON(fs_info->delayed_block_rsv.size > 0);  	WARN_ON(fs_info->delayed_block_rsv.reserved > 0); +	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); +	WARN_ON(fs_info->delayed_refs_rsv.size > 0);  } +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; +	u64 num_bytes; + +	if (!trans->delayed_ref_updates) +		return; + +	num_bytes = btrfs_calc_trans_metadata_size(fs_info, +						   trans->delayed_ref_updates); +	spin_lock(&delayed_rsv->lock); +	delayed_rsv->size += num_bytes; +	delayed_rsv->full = 0; +	spin_unlock(&delayed_rsv->lock); +	trans->delayed_ref_updates = 0; +}  /*   * To be called after all the new block groups attached to the transaction @@ -6100,6 +6303,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,  	u64 old_val;  	u64 byte_in_group;  	int factor; +	int ret = 0;  	/* block accounting for super block */  	spin_lock(&info->delalloc_root_lock); @@ -6113,8 +6317,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,  	while (total) {  		cache = btrfs_lookup_block_group(info, bytenr); -		if (!cache) -			return -ENOENT; +		if (!cache) { +			ret = -ENOENT; +			break; +		}  		factor = btrfs_bg_type_to_factor(cache->flags);  		/* @@ -6151,7 +6357,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,  			old_val -= num_bytes;  			btrfs_set_block_group_used(&cache->item, old_val);  			cache->pinned += num_bytes; -			cache->space_info->bytes_pinned += num_bytes; +			update_bytes_pinned(cache->space_info, num_bytes);  			cache->space_info->bytes_used -= num_bytes;  			cache->space_info->disk_used -= num_bytes * factor;  			spin_unlock(&cache->lock); @@ -6173,6 +6379,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,  			list_add_tail(&cache->dirty_list,  				      &trans->transaction->dirty_bgs);  			trans->transaction->num_dirty_bgs++; +			trans->delayed_ref_updates++;  			btrfs_get_block_group(cache);  		}  		spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -6190,7 +6397,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,  		total -= num_bytes;  		bytenr += num_bytes;  	} -	return 0; + +	/* Modified block groups are accounted for in the delayed_refs_rsv. 
*/ +	btrfs_update_delayed_refs_rsv(trans); +	return ret;  }  static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) @@ -6222,7 +6432,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,  	spin_lock(&cache->space_info->lock);  	spin_lock(&cache->lock);  	cache->pinned += num_bytes; -	cache->space_info->bytes_pinned += num_bytes; +	update_bytes_pinned(cache->space_info, num_bytes);  	if (reserved) {  		cache->reserved -= num_bytes;  		cache->space_info->bytes_reserved -= num_bytes; @@ -6431,7 +6641,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,  	} else {  		cache->reserved += num_bytes;  		space_info->bytes_reserved += num_bytes; -		space_info->bytes_may_use -= ram_bytes; +		update_bytes_may_use(space_info, -ram_bytes);  		if (delalloc)  			cache->delalloc_bytes += num_bytes;  	} @@ -6587,7 +6797,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,  		spin_lock(&space_info->lock);  		spin_lock(&cache->lock);  		cache->pinned -= len; -		space_info->bytes_pinned -= len; +		update_bytes_pinned(space_info, -len);  		trace_btrfs_space_reservation(fs_info, "pinned",  					      space_info->flags, len, 0); @@ -6608,7 +6818,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,  				to_add = min(len, global_rsv->size -  					     global_rsv->reserved);  				global_rsv->reserved += to_add; -				space_info->bytes_may_use += to_add; +				update_bytes_may_use(space_info, to_add);  				if (global_rsv->reserved >= global_rsv->size)  					global_rsv->full = 1;  				trace_btrfs_space_reservation(fs_info, @@ -6647,9 +6857,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)  		unpin = &fs_info->freed_extents[0];  	while (!trans->aborted) { +		struct extent_state *cached_state = NULL; +  		mutex_lock(&fs_info->unused_bg_unpin_mutex);  		ret = find_first_extent_bit(unpin, 0, &start, &end, -					    EXTENT_DIRTY, NULL); +					    EXTENT_DIRTY, &cached_state);  		if (ret) {  			mutex_unlock(&fs_info->unused_bg_unpin_mutex);  			break; @@ -6659,9 +6871,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)  			ret = btrfs_discard_extent(fs_info, start,  						   end + 1 - start, NULL); -		clear_extent_dirty(unpin, start, end); +		clear_extent_dirty(unpin, start, end, &cached_state);  		unpin_extent_range(fs_info, start, end, true);  		mutex_unlock(&fs_info->unused_bg_unpin_mutex); +		free_extent_state(cached_state);  		cond_resched();  	} @@ -6955,12 +7168,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))  		goto out; -	if (head->extent_op) { -		if (!head->must_insert_reserved) -			goto out; -		btrfs_free_delayed_extent_op(head->extent_op); -		head->extent_op = NULL; -	} +	if (cleanup_extent_op(head) != NULL) +		goto out;  	/*  	 * waiting for the lock here would deadlock.  If someone else has it @@ -6969,22 +7178,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  	if (!mutex_trylock(&head->mutex))  		goto out; -	/* -	 * at this point we have a head with no other entries.  Go -	 * ahead and process it. -	 */ -	rb_erase_cached(&head->href_node, &delayed_refs->href_root); -	RB_CLEAR_NODE(&head->href_node); -	atomic_dec(&delayed_refs->num_entries); - -	/* -	 * we don't take a ref on the node because we're removing it from the -	 * tree, so we just steal the ref the tree was holding. 
-	 */ -	delayed_refs->num_heads--; -	if (head->processing == 0) -		delayed_refs->num_heads_ready--; +	btrfs_delete_ref_head(delayed_refs, head);  	head->processing = 0; +  	spin_unlock(&head->lock);  	spin_unlock(&delayed_refs->lock); @@ -6992,6 +7188,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  	if (head->must_insert_reserved)  		ret = 1; +	cleanup_ref_head_accounting(trans, head);  	mutex_unlock(&head->mutex);  	btrfs_put_delayed_ref_head(head);  	return ret; @@ -7239,6 +7436,345 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,  }  /* + * Structure used internally for find_free_extent() function.  Wraps needed + * parameters. + */ +struct find_free_extent_ctl { +	/* Basic allocation info */ +	u64 ram_bytes; +	u64 num_bytes; +	u64 empty_size; +	u64 flags; +	int delalloc; + +	/* Where to start the search inside the bg */ +	u64 search_start; + +	/* For clustered allocation */ +	u64 empty_cluster; + +	bool have_caching_bg; +	bool orig_have_caching_bg; + +	/* RAID index, converted from flags */ +	int index; + +	/* +	 * Current loop number, check find_free_extent_update_loop() for details +	 */ +	int loop; + +	/* +	 * Whether we're refilling a cluster, if true we need to re-search +	 * current block group but don't try to refill the cluster again. +	 */ +	bool retry_clustered; + +	/* +	 * Whether we're updating free space cache, if true we need to re-search +	 * current block group but don't try updating free space cache again. +	 */ +	bool retry_unclustered; + +	/* If current block group is cached */ +	int cached; + +	/* Max contiguous hole found */ +	u64 max_extent_size; + +	/* Total free space from free space cache, not always contiguous */ +	u64 total_free_space; + +	/* Found result */ +	u64 found_offset; +}; + + +/* + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need fallback to unclustered mode. + * Return -EAGAIN to inform caller that we need to re-search this block group + * Return >0 to inform caller that we find nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, +		struct btrfs_free_cluster *last_ptr, +		struct find_free_extent_ctl *ffe_ctl, +		struct btrfs_block_group_cache **cluster_bg_ret) +{ +	struct btrfs_fs_info *fs_info = bg->fs_info; +	struct btrfs_block_group_cache *cluster_bg; +	u64 aligned_cluster; +	u64 offset; +	int ret; + +	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); +	if (!cluster_bg) +		goto refill_cluster; +	if (cluster_bg != bg && (cluster_bg->ro || +	    !block_group_bits(cluster_bg, ffe_ctl->flags))) +		goto release_cluster; + +	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, +			ffe_ctl->num_bytes, cluster_bg->key.objectid, +			&ffe_ctl->max_extent_size); +	if (offset) { +		/* We have a block, we're done */ +		spin_unlock(&last_ptr->refill_lock); +		trace_btrfs_reserve_extent_cluster(cluster_bg, +				ffe_ctl->search_start, ffe_ctl->num_bytes); +		*cluster_bg_ret = cluster_bg; +		ffe_ctl->found_offset = offset; +		return 0; +	} +	WARN_ON(last_ptr->block_group != cluster_bg); + +release_cluster: +	/* +	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so +	 * lets just skip it and let the allocator find whatever block it can +	 * find. 
If we reach this point, we will have tried the cluster +	 * allocator plenty of times and not have found anything, so we are +	 * likely way too fragmented for the clustering stuff to find anything. +	 * +	 * However, if the cluster is taken from the current block group, +	 * release the cluster first, so that we stand a better chance of +	 * succeeding in the unclustered allocation. +	 */ +	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { +		spin_unlock(&last_ptr->refill_lock); +		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); +		return -ENOENT; +	} + +	/* This cluster didn't work out, free it and start over */ +	btrfs_return_cluster_to_free_space(NULL, last_ptr); + +	if (cluster_bg != bg) +		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + +refill_cluster: +	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { +		spin_unlock(&last_ptr->refill_lock); +		return -ENOENT; +	} + +	aligned_cluster = max_t(u64, +			ffe_ctl->empty_cluster + ffe_ctl->empty_size, +			bg->full_stripe_len); +	ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, +			ffe_ctl->search_start, ffe_ctl->num_bytes, +			aligned_cluster); +	if (ret == 0) { +		/* Now pull our allocation out of this cluster */ +		offset = btrfs_alloc_from_cluster(bg, last_ptr, +				ffe_ctl->num_bytes, ffe_ctl->search_start, +				&ffe_ctl->max_extent_size); +		if (offset) { +			/* We found one, proceed */ +			spin_unlock(&last_ptr->refill_lock); +			trace_btrfs_reserve_extent_cluster(bg, +					ffe_ctl->search_start, +					ffe_ctl->num_bytes); +			ffe_ctl->found_offset = offset; +			return 0; +		} +	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && +		   !ffe_ctl->retry_clustered) { +		spin_unlock(&last_ptr->refill_lock); + +		ffe_ctl->retry_clustered = true; +		wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + +				ffe_ctl->empty_cluster + ffe_ctl->empty_size); +		return -EAGAIN; +	} +	/* +	 * At this point we either didn't find a cluster or we weren't able to +	 * allocate a block from our cluster.  Free the cluster we've been +	 * trying to use, and go to the next block group. +	 */ +	btrfs_return_cluster_to_free_space(NULL, last_ptr); +	spin_unlock(&last_ptr->refill_lock); +	return 1; +} + +/* + * Return >0 to inform caller that we find nothing + * Return 0 when we found an free extent and set ffe_ctrl->found_offset + * Return -EAGAIN to inform caller that we need to re-search this block group + */ +static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, +		struct btrfs_free_cluster *last_ptr, +		struct find_free_extent_ctl *ffe_ctl) +{ +	u64 offset; + +	/* +	 * We are doing an unclustered allocation, set the fragmented flag so +	 * we don't bother trying to setup a cluster again until we get more +	 * space. 
+	 */ +	if (unlikely(last_ptr)) { +		spin_lock(&last_ptr->lock); +		last_ptr->fragmented = 1; +		spin_unlock(&last_ptr->lock); +	} +	if (ffe_ctl->cached) { +		struct btrfs_free_space_ctl *free_space_ctl; + +		free_space_ctl = bg->free_space_ctl; +		spin_lock(&free_space_ctl->tree_lock); +		if (free_space_ctl->free_space < +		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster + +		    ffe_ctl->empty_size) { +			ffe_ctl->total_free_space = max_t(u64, +					ffe_ctl->total_free_space, +					free_space_ctl->free_space); +			spin_unlock(&free_space_ctl->tree_lock); +			return 1; +		} +		spin_unlock(&free_space_ctl->tree_lock); +	} + +	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, +			ffe_ctl->num_bytes, ffe_ctl->empty_size, +			&ffe_ctl->max_extent_size); + +	/* +	 * If we didn't find a chunk, and we haven't failed on this block group +	 * before, and this block group is in the middle of caching and we are +	 * ok with waiting, then go ahead and wait for progress to be made, and +	 * set @retry_unclustered to true. +	 * +	 * If @retry_unclustered is true then we've already waited on this +	 * block group once and should move on to the next block group. +	 */ +	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && +	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) { +		wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + +						ffe_ctl->empty_size); +		ffe_ctl->retry_unclustered = true; +		return -EAGAIN; +	} else if (!offset) { +		return 1; +	} +	ffe_ctl->found_offset = offset; +	return 0; +} + +/* + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. + * Return <0 means we failed to locate any free extent. + */ +static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, +					struct btrfs_free_cluster *last_ptr, +					struct btrfs_key *ins, +					struct find_free_extent_ctl *ffe_ctl, +					int full_search, bool use_cluster) +{ +	struct btrfs_root *root = fs_info->extent_root; +	int ret; + +	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && +	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) +		ffe_ctl->orig_have_caching_bg = true; + +	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && +	    ffe_ctl->have_caching_bg) +		return 1; + +	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) +		return 1; + +	if (ins->objectid) { +		if (!use_cluster && last_ptr) { +			spin_lock(&last_ptr->lock); +			last_ptr->window_start = ins->objectid; +			spin_unlock(&last_ptr->lock); +		} +		return 0; +	} + +	/* +	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking +	 *			caching kthreads as we move along +	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching +	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again +	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try +	 *		       again +	 */ +	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { +		ffe_ctl->index = 0; +		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { +			/* +			 * We want to skip the LOOP_CACHING_WAIT step if we +			 * don't have any uncached bgs and we've already done a +			 * full search through. 
+			 */ +			if (ffe_ctl->orig_have_caching_bg || !full_search) +				ffe_ctl->loop = LOOP_CACHING_WAIT; +			else +				ffe_ctl->loop = LOOP_ALLOC_CHUNK; +		} else { +			ffe_ctl->loop++; +		} + +		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { +			struct btrfs_trans_handle *trans; +			int exist = 0; + +			trans = current->journal_info; +			if (trans) +				exist = 1; +			else +				trans = btrfs_join_transaction(root); + +			if (IS_ERR(trans)) { +				ret = PTR_ERR(trans); +				return ret; +			} + +			ret = do_chunk_alloc(trans, ffe_ctl->flags, +					     CHUNK_ALLOC_FORCE); + +			/* +			 * If we can't allocate a new chunk we've already looped +			 * through at least once, move on to the NO_EMPTY_SIZE +			 * case. +			 */ +			if (ret == -ENOSPC) +				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + +			/* Do not bail out on ENOSPC since we can do more. */ +			if (ret < 0 && ret != -ENOSPC) +				btrfs_abort_transaction(trans, ret); +			else +				ret = 0; +			if (!exist) +				btrfs_end_transaction(trans); +			if (ret) +				return ret; +		} + +		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { +			/* +			 * Don't loop again if we already have no empty_size and +			 * no empty_cluster. +			 */ +			if (ffe_ctl->empty_size == 0 && +			    ffe_ctl->empty_cluster == 0) +				return -ENOSPC; +			ffe_ctl->empty_size = 0; +			ffe_ctl->empty_cluster = 0; +		} +		return 1; +	} +	return -ENOSPC; +} + +/*   * walks the btree of allocated extents and find a hole of a given size.   * The key ins is changed to record the hole:   * ins->objectid == start position @@ -7248,6 +7784,20 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,   *   * If there is no suitable free space, we will record the max size of   * the free space extent currently. + * + * The overall logic and call chain: + * + * find_free_extent() + * |- Iterate through all block groups + * |  |- Get a valid block group + * |  |- Try to do clustered allocation in that block group + * |  |- Try to do unclustered allocation in that block group + * |  |- Check if the result is valid + * |  |  |- If valid, then exit + * |  |- Jump to next block group + * | + * |- Push harder to find free extents + *    |- If not found, re-iterate all block groups   */  static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  				u64 ram_bytes, u64 num_bytes, u64 empty_size, @@ -7255,24 +7805,28 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  				u64 flags, int delalloc)  {  	int ret = 0; -	struct btrfs_root *root = fs_info->extent_root;  	struct btrfs_free_cluster *last_ptr = NULL;  	struct btrfs_block_group_cache *block_group = NULL; -	u64 search_start = 0; -	u64 max_extent_size = 0; -	u64 max_free_space = 0; -	u64 empty_cluster = 0; +	struct find_free_extent_ctl ffe_ctl = {0};  	struct btrfs_space_info *space_info; -	int loop = 0; -	int index = btrfs_bg_flags_to_raid_index(flags); -	bool failed_cluster_refill = false; -	bool failed_alloc = false;  	bool use_cluster = true; -	bool have_caching_bg = false; -	bool orig_have_caching_bg = false;  	bool full_search = false;  	WARN_ON(num_bytes < fs_info->sectorsize); + +	ffe_ctl.ram_bytes = ram_bytes; +	ffe_ctl.num_bytes = num_bytes; +	ffe_ctl.empty_size = empty_size; +	ffe_ctl.flags = flags; +	ffe_ctl.search_start = 0; +	ffe_ctl.retry_clustered = false; +	ffe_ctl.retry_unclustered = false; +	ffe_ctl.delalloc = delalloc; +	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); +	ffe_ctl.have_caching_bg = false; +	ffe_ctl.orig_have_caching_bg = false; +	ffe_ctl.found_offset = 0; +  	ins->type = 
BTRFS_EXTENT_ITEM_KEY;  	ins->objectid = 0;  	ins->offset = 0; @@ -7308,7 +7862,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  		spin_unlock(&space_info->lock);  	} -	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); +	last_ptr = fetch_cluster_info(fs_info, space_info, +				      &ffe_ctl.empty_cluster);  	if (last_ptr) {  		spin_lock(&last_ptr->lock);  		if (last_ptr->block_group) @@ -7325,10 +7880,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  		spin_unlock(&last_ptr->lock);  	} -	search_start = max(search_start, first_logical_byte(fs_info, 0)); -	search_start = max(search_start, hint_byte); -	if (search_start == hint_byte) { -		block_group = btrfs_lookup_block_group(fs_info, search_start); +	ffe_ctl.search_start = max(ffe_ctl.search_start, +				   first_logical_byte(fs_info, 0)); +	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); +	if (ffe_ctl.search_start == hint_byte) { +		block_group = btrfs_lookup_block_group(fs_info, +						       ffe_ctl.search_start);  		/*  		 * we don't want to use the block group if it doesn't match our  		 * allocation bits, or if its not cached. @@ -7350,7 +7907,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  				btrfs_put_block_group(block_group);  				up_read(&space_info->groups_sem);  			} else { -				index = btrfs_bg_flags_to_raid_index( +				ffe_ctl.index = btrfs_bg_flags_to_raid_index(  						block_group->flags);  				btrfs_lock_block_group(block_group, delalloc);  				goto have_block_group; @@ -7360,21 +7917,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  		}  	}  search: -	have_caching_bg = false; -	if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags)) +	ffe_ctl.have_caching_bg = false; +	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || +	    ffe_ctl.index == 0)  		full_search = true;  	down_read(&space_info->groups_sem); -	list_for_each_entry(block_group, &space_info->block_groups[index], -			    list) { -		u64 offset; -		int cached; - +	list_for_each_entry(block_group, +			    &space_info->block_groups[ffe_ctl.index], list) {  		/* If the block group is read-only, we can skip it entirely. 
*/  		if (unlikely(block_group->ro))  			continue;  		btrfs_grab_block_group(block_group, delalloc); -		search_start = block_group->key.objectid; +		ffe_ctl.search_start = block_group->key.objectid;  		/*  		 * this can happen if we end up cycling through all the @@ -7398,9 +7953,9 @@ search:  		}  have_block_group: -		cached = block_group_cache_done(block_group); -		if (unlikely(!cached)) { -			have_caching_bg = true; +		ffe_ctl.cached = block_group_cache_done(block_group); +		if (unlikely(!ffe_ctl.cached)) { +			ffe_ctl.have_caching_bg = true;  			ret = cache_block_group(block_group, 0);  			BUG_ON(ret < 0);  			ret = 0; @@ -7414,322 +7969,92 @@ have_block_group:  		 * lets look there  		 */  		if (last_ptr && use_cluster) { -			struct btrfs_block_group_cache *used_block_group; -			unsigned long aligned_cluster; -			/* -			 * the refill lock keeps out other -			 * people trying to start a new cluster -			 */ -			used_block_group = btrfs_lock_cluster(block_group, -							      last_ptr, -							      delalloc); -			if (!used_block_group) -				goto refill_cluster; - -			if (used_block_group != block_group && -			    (used_block_group->ro || -			     !block_group_bits(used_block_group, flags))) -				goto release_cluster; - -			offset = btrfs_alloc_from_cluster(used_block_group, -						last_ptr, -						num_bytes, -						used_block_group->key.objectid, -						&max_extent_size); -			if (offset) { -				/* we have a block, we're done */ -				spin_unlock(&last_ptr->refill_lock); -				trace_btrfs_reserve_extent_cluster( -						used_block_group, -						search_start, num_bytes); -				if (used_block_group != block_group) { -					btrfs_release_block_group(block_group, -								  delalloc); -					block_group = used_block_group; -				} -				goto checks; -			} - -			WARN_ON(last_ptr->block_group != used_block_group); -release_cluster: -			/* If we are on LOOP_NO_EMPTY_SIZE, we can't -			 * set up a new clusters, so lets just skip it -			 * and let the allocator find whatever block -			 * it can find.  If we reach this point, we -			 * will have tried the cluster allocator -			 * plenty of times and not have found -			 * anything, so we are likely way too -			 * fragmented for the clustering stuff to find -			 * anything. -			 * -			 * However, if the cluster is taken from the -			 * current block group, release the cluster -			 * first, so that we stand a better chance of -			 * succeeding in the unclustered -			 * allocation.  
*/ -			if (loop >= LOOP_NO_EMPTY_SIZE && -			    used_block_group != block_group) { -				spin_unlock(&last_ptr->refill_lock); -				btrfs_release_block_group(used_block_group, -							  delalloc); -				goto unclustered_alloc; -			} +			struct btrfs_block_group_cache *cluster_bg = NULL; -			/* -			 * this cluster didn't work out, free it and -			 * start over -			 */ -			btrfs_return_cluster_to_free_space(NULL, last_ptr); - -			if (used_block_group != block_group) -				btrfs_release_block_group(used_block_group, -							  delalloc); -refill_cluster: -			if (loop >= LOOP_NO_EMPTY_SIZE) { -				spin_unlock(&last_ptr->refill_lock); -				goto unclustered_alloc; -			} - -			aligned_cluster = max_t(unsigned long, -						empty_cluster + empty_size, -					      block_group->full_stripe_len); +			ret = find_free_extent_clustered(block_group, last_ptr, +							 &ffe_ctl, &cluster_bg); -			/* allocate a cluster in this block group */ -			ret = btrfs_find_space_cluster(fs_info, block_group, -						       last_ptr, search_start, -						       num_bytes, -						       aligned_cluster);  			if (ret == 0) { -				/* -				 * now pull our allocation out of this -				 * cluster -				 */ -				offset = btrfs_alloc_from_cluster(block_group, -							last_ptr, -							num_bytes, -							search_start, -							&max_extent_size); -				if (offset) { -					/* we found one, proceed */ -					spin_unlock(&last_ptr->refill_lock); -					trace_btrfs_reserve_extent_cluster( -						block_group, search_start, -						num_bytes); -					goto checks; +				if (cluster_bg && cluster_bg != block_group) { +					btrfs_release_block_group(block_group, +								  delalloc); +					block_group = cluster_bg;  				} -			} else if (!cached && loop > LOOP_CACHING_NOWAIT -				   && !failed_cluster_refill) { -				spin_unlock(&last_ptr->refill_lock); - -				failed_cluster_refill = true; -				wait_block_group_cache_progress(block_group, -				       num_bytes + empty_cluster + empty_size); +				goto checks; +			} else if (ret == -EAGAIN) {  				goto have_block_group; -			} - -			/* -			 * at this point we either didn't find a cluster -			 * or we weren't able to allocate a block from our -			 * cluster.  Free the cluster we've been trying -			 * to use, and go to the next block group -			 */ -			btrfs_return_cluster_to_free_space(NULL, last_ptr); -			spin_unlock(&last_ptr->refill_lock); -			goto loop; -		} - -unclustered_alloc: -		/* -		 * We are doing an unclustered alloc, set the fragmented flag so -		 * we don't bother trying to setup a cluster again until we get -		 * more space. -		 */ -		if (unlikely(last_ptr)) { -			spin_lock(&last_ptr->lock); -			last_ptr->fragmented = 1; -			spin_unlock(&last_ptr->lock); -		} -		if (cached) { -			struct btrfs_free_space_ctl *ctl = -				block_group->free_space_ctl; - -			spin_lock(&ctl->tree_lock); -			if (ctl->free_space < -			    num_bytes + empty_cluster + empty_size) { -				max_free_space = max(max_free_space, -						     ctl->free_space); -				spin_unlock(&ctl->tree_lock); +			} else if (ret > 0) {  				goto loop;  			} -			spin_unlock(&ctl->tree_lock); +			/* ret == -ENOENT case falls through */  		} -		offset = btrfs_find_space_for_alloc(block_group, search_start, -						    num_bytes, empty_size, -						    &max_extent_size); -		/* -		 * If we didn't find a chunk, and we haven't failed on this -		 * block group before, and this block group is in the middle of -		 * caching and we are ok with waiting, then go ahead and wait -		 * for progress to be made, and set failed_alloc to true. 
-		 * -		 * If failed_alloc is true then we've already waited on this -		 * block group once and should move on to the next block group. -		 */ -		if (!offset && !failed_alloc && !cached && -		    loop > LOOP_CACHING_NOWAIT) { -			wait_block_group_cache_progress(block_group, -						num_bytes + empty_size); -			failed_alloc = true; +		ret = find_free_extent_unclustered(block_group, last_ptr, +						   &ffe_ctl); +		if (ret == -EAGAIN)  			goto have_block_group; -		} else if (!offset) { +		else if (ret > 0)  			goto loop; -		} +		/* ret == 0 case falls through */  checks: -		search_start = round_up(offset, fs_info->stripesize); +		ffe_ctl.search_start = round_up(ffe_ctl.found_offset, +					     fs_info->stripesize);  		/* move on to the next group */ -		if (search_start + num_bytes > +		if (ffe_ctl.search_start + num_bytes >  		    block_group->key.objectid + block_group->key.offset) { -			btrfs_add_free_space(block_group, offset, num_bytes); +			btrfs_add_free_space(block_group, ffe_ctl.found_offset, +					     num_bytes);  			goto loop;  		} -		if (offset < search_start) -			btrfs_add_free_space(block_group, offset, -					     search_start - offset); +		if (ffe_ctl.found_offset < ffe_ctl.search_start) +			btrfs_add_free_space(block_group, ffe_ctl.found_offset, +				ffe_ctl.search_start - ffe_ctl.found_offset);  		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,  				num_bytes, delalloc);  		if (ret == -EAGAIN) { -			btrfs_add_free_space(block_group, offset, num_bytes); +			btrfs_add_free_space(block_group, ffe_ctl.found_offset, +					     num_bytes);  			goto loop;  		}  		btrfs_inc_block_group_reservations(block_group);  		/* we are all good, lets return */ -		ins->objectid = search_start; +		ins->objectid = ffe_ctl.search_start;  		ins->offset = num_bytes; -		trace_btrfs_reserve_extent(block_group, search_start, num_bytes); +		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, +					   num_bytes);  		btrfs_release_block_group(block_group, delalloc);  		break;  loop: -		failed_cluster_refill = false; -		failed_alloc = false; +		ffe_ctl.retry_clustered = false; +		ffe_ctl.retry_unclustered = false;  		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != -		       index); +		       ffe_ctl.index);  		btrfs_release_block_group(block_group, delalloc);  		cond_resched();  	}  	up_read(&space_info->groups_sem); -	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg -		&& !orig_have_caching_bg) -		orig_have_caching_bg = true; - -	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) -		goto search; - -	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) +	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, +					   full_search, use_cluster); +	if (ret > 0)  		goto search; -	/* -	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking -	 *			caching kthreads as we move along -	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching -	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again -	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try -	 *			again -	 */ -	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { -		index = 0; -		if (loop == LOOP_CACHING_NOWAIT) { -			/* -			 * We want to skip the LOOP_CACHING_WAIT step if we -			 * don't have any uncached bgs and we've already done a -			 * full search through. 
-			 */ -			if (orig_have_caching_bg || !full_search) -				loop = LOOP_CACHING_WAIT; -			else -				loop = LOOP_ALLOC_CHUNK; -		} else { -			loop++; -		} - -		if (loop == LOOP_ALLOC_CHUNK) { -			struct btrfs_trans_handle *trans; -			int exist = 0; - -			trans = current->journal_info; -			if (trans) -				exist = 1; -			else -				trans = btrfs_join_transaction(root); - -			if (IS_ERR(trans)) { -				ret = PTR_ERR(trans); -				goto out; -			} - -			ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE); - -			/* -			 * If we can't allocate a new chunk we've already looped -			 * through at least once, move on to the NO_EMPTY_SIZE -			 * case. -			 */ -			if (ret == -ENOSPC) -				loop = LOOP_NO_EMPTY_SIZE; - -			/* -			 * Do not bail out on ENOSPC since we -			 * can do more things. -			 */ -			if (ret < 0 && ret != -ENOSPC) -				btrfs_abort_transaction(trans, ret); -			else -				ret = 0; -			if (!exist) -				btrfs_end_transaction(trans); -			if (ret) -				goto out; -		} - -		if (loop == LOOP_NO_EMPTY_SIZE) { -			/* -			 * Don't loop again if we already have no empty_size and -			 * no empty_cluster. -			 */ -			if (empty_size == 0 && -			    empty_cluster == 0) { -				ret = -ENOSPC; -				goto out; -			} -			empty_size = 0; -			empty_cluster = 0; -		} - -		goto search; -	} else if (!ins->objectid) { -		ret = -ENOSPC; -	} else if (ins->objectid) { -		if (!use_cluster && last_ptr) { -			spin_lock(&last_ptr->lock); -			last_ptr->window_start = ins->objectid; -			spin_unlock(&last_ptr->lock); -		} -		ret = 0; -	} -out:  	if (ret == -ENOSPC) { -		if (!max_extent_size) -			max_extent_size = max_free_space; +		/* +		 * Use ffe_ctl->total_free_space as fallback if we can't find +		 * any contiguous hole. +		 */ +		if (!ffe_ctl.max_extent_size) +			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;  		spin_lock(&space_info->lock); -		space_info->max_extent_size = max_extent_size; +		space_info->max_extent_size = ffe_ctl.max_extent_size;  		spin_unlock(&space_info->lock); -		ins->offset = max_extent_size; +		ins->offset = ffe_ctl.max_extent_size;  	}  	return ret;  } @@ -8169,13 +8494,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,  	btrfs_set_header_generation(buf, trans->transid);  	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);  	btrfs_set_header_owner(buf, owner); -	write_extent_buffer_fsid(buf, fs_info->fsid); +	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);  	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);  	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {  		buf->log_index = root->log_transid % 2;  		/*  		 * we allow two log transactions at a time, use different -		 * EXENT bit to differentiate dirty pages. +		 * EXTENT bit to differentiate dirty pages.  		 */  		if (buf->log_index == 0)  			set_extent_dirty(&root->dirty_log_pages, buf->start, @@ -8221,7 +8546,12 @@ again:  		goto again;  	} -	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { +	/* +	 * The global reserve still exists to save us from ourselves, so don't +	 * warn_on if we are short on our delayed refs reserve. 
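In the ENOSPC path above, find_free_extent now hands back a size hint: ffe_ctl.max_extent_size (or total_free_space when no contiguous hole was seen) is published via space_info and returned through ins->offset so the caller can retry with a smaller request. A minimal sketch of that "fail, but report the largest hole" pattern; the free-hole list is invented purely for the demo.

#include <errno.h>
#include <stdio.h>

/*
 * Toy allocator: on failure it still reports the largest hole it saw so
 * the caller can shrink its request on retry.
 */
static int alloc_extent(unsigned long long want,
			const unsigned long long *holes, int nr,
			unsigned long long *max_hole)
{
	*max_hole = 0;
	for (int i = 0; i < nr; i++) {
		if (holes[i] >= want)
			return 0;		/* big enough hole found */
		if (holes[i] > *max_hole)
			*max_hole = holes[i];	/* remember the best we saw */
	}
	return -ENOSPC;				/* *max_hole is the hint */
}

int main(void)
{
	const unsigned long long holes[] = { 4096, 65536, 16384 };
	unsigned long long hint = 0;

	if (alloc_extent(1ULL << 20, holes, 3, &hint) == -ENOSPC)
		printf("ENOSPC, largest hole %llu, retry smaller\n", hint);
	return 0;
}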
+	 */ +	if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && +	    btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {  		static DEFINE_RATELIMIT_STATE(_rs,  				DEFAULT_RATELIMIT_INTERVAL * 10,  				/*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8544,7 +8874,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,  	u64 bytenr;  	u64 generation;  	u64 parent; -	u32 blocksize;  	struct btrfs_key key;  	struct btrfs_key first_key;  	struct extent_buffer *next; @@ -8569,7 +8898,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,  	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);  	btrfs_node_key_to_cpu(path->nodes[level], &first_key,  			      path->slots[level]); -	blocksize = fs_info->nodesize;  	next = find_extent_buffer(fs_info, bytenr);  	if (!next) { @@ -8693,7 +9021,7 @@ skip:  					     ret);  			}  		} -		ret = btrfs_free_extent(trans, root, bytenr, blocksize, +		ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,  					parent, root->root_key.objectid,  					level - 1, 0);  		if (ret) @@ -8944,9 +9272,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  		goto out_free;  	} +	err = btrfs_run_delayed_items(trans); +	if (err) +		goto out_end_trans; +  	if (block_rsv)  		trans->block_rsv = block_rsv; +	/* +	 * This will help us catch people modifying the fs tree while we're +	 * dropping it.  It is unsafe to mess with the fs tree while it's being +	 * dropped as we unlock the root node and parent nodes as we walk down +	 * the tree, assuming nothing will change.  If something does change +	 * then we'll have stale information and drop references to blocks we've +	 * already dropped. +	 */ +	set_bit(BTRFS_ROOT_DELETING, &root->state);  	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {  		level = btrfs_header_level(root->node);  		path->nodes[level] = btrfs_lock_root_node(root); @@ -9421,7 +9762,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)  }  /* - * checks to see if its even possible to relocate this block group. + * Checks to see if it's even possible to relocate this block group.   *   * @return - -1 if it's not a good idea to relocate this block group, 0 if its   * ok to go ahead and try. @@ -10049,7 +10390,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)  		 * check for two cases, either we are full, and therefore  		 * don't need to bother with the caching work since we won't  		 * find any space, or we are empty, and we can just add all -		 * the space in and be done with it.  This saves us _alot_ of +		 * the space in and be done with it.  This saves us _a_lot_ of  		 * time, particularly in the full case.  		 */  		if (found_key.offset == btrfs_block_group_used(&cache->item)) { @@ -10154,6 +10495,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)  		add_block_group_free_space(trans, block_group);  		/* already aborted the transaction if it failed. 
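btrfs_drop_snapshot() above now runs the delayed items first and sets BTRFS_ROOT_DELETING on the root as a tripwire against anyone modifying a tree that is in the middle of being dropped. A schematic of that flag pattern, using C11 atomics in place of set_bit()/test_bit(); try_modify() is an invented check that only illustrates where such a test could sit.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ROOT_DELETING (1u << 0)

struct toy_root { _Atomic unsigned int state; };

static void start_drop(struct toy_root *root)
{
	/* Set once the snapshot drop begins, analogous to set_bit(). */
	atomic_fetch_or(&root->state, ROOT_DELETING);
}

static bool try_modify(struct toy_root *root)
{
	if (atomic_load(&root->state) & ROOT_DELETING) {
		fprintf(stderr, "bug: modifying a root that is being dropped\n");
		return false;
	}
	return true;
}

int main(void)
{
	struct toy_root root = { 0 };

	printf("before drop: modify allowed = %d\n", try_modify(&root));
	start_drop(&root);
	printf("after drop:  modify allowed = %d\n", try_modify(&root));
	return 0;
}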
*/  next: +		btrfs_delayed_refs_rsv_release(fs_info, 1);  		list_del_init(&block_group->bg_list);  	}  	btrfs_trans_release_chunk_metadata(trans); @@ -10231,6 +10573,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,  	link_block_group(cache);  	list_add_tail(&cache->bg_list, &trans->new_bgs); +	trans->delayed_ref_updates++; +	btrfs_update_delayed_refs_rsv(trans);  	set_avail_alloc_bits(fs_info, type);  	return 0; @@ -10268,6 +10612,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	int factor;  	struct btrfs_caching_control *caching_ctl = NULL;  	bool remove_em; +	bool remove_rsv = false;  	block_group = btrfs_lookup_block_group(fs_info, group_start);  	BUG_ON(!block_group); @@ -10315,7 +10660,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	mutex_lock(&trans->transaction->cache_write_mutex);  	/* -	 * make sure our free spache cache IO is done before remove the +	 * Make sure our free space cache IO is done before removing the  	 * free space inode  	 */  	spin_lock(&trans->transaction->dirty_bgs_lock); @@ -10332,6 +10677,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	if (!list_empty(&block_group->dirty_list)) {  		list_del_init(&block_group->dirty_list); +		remove_rsv = true;  		btrfs_put_block_group(block_group);  	}  	spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -10541,6 +10887,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	ret = btrfs_del_item(trans, root, path);  out: +	if (remove_rsv) +		btrfs_delayed_refs_rsv_release(fs_info, 1);  	btrfs_free_path(path);  	return ret;  } @@ -10698,7 +11046,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)  		spin_lock(&space_info->lock);  		spin_lock(&block_group->lock); -		space_info->bytes_pinned -= block_group->pinned; +		update_bytes_pinned(space_info, -block_group->pinned);  		space_info->bytes_readonly += block_group->pinned;  		percpu_counter_add_batch(&space_info->total_bytes_pinned,  				   -block_group->pinned, @@ -10829,7 +11177,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,  	if (!blk_queue_discard(bdev_get_queue(device->bdev)))  		return 0; -	/* Not writeable = nothing to do. */ +	/* Not writable = nothing to do. 
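Several hunks above pair a charge against the delayed-refs reservation when a block group is queued (btrfs_make_block_group bumps trans->delayed_ref_updates and calls btrfs_update_delayed_refs_rsv) with a matching btrfs_delayed_refs_rsv_release() once the pending group has been processed or removed. A toy counter showing the intended balance; the struct and helpers below are illustrative only, not the kernel's block reserve.

#include <assert.h>
#include <stdio.h>

struct toy_rsv { long reserved_units; };

static void rsv_charge(struct toy_rsv *rsv, long n)
{
	rsv->reserved_units += n;
}

static void rsv_release(struct toy_rsv *rsv, long n)
{
	assert(rsv->reserved_units >= n);	/* never release more than held */
	rsv->reserved_units -= n;
}

int main(void)
{
	struct toy_rsv rsv = { 0 };

	rsv_charge(&rsv, 1);	/* block group queued on trans->new_bgs */
	rsv_charge(&rsv, 1);	/* a second pending block group */

	rsv_release(&rsv, 1);	/* first group's item created */
	rsv_release(&rsv, 1);	/* second group's item created */

	printf("outstanding units: %ld\n", rsv.reserved_units);
	return rsv.reserved_units != 0;
}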
*/  	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))  		return 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d228f706ff3e..fc126b92ea59 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -89,9 +89,18 @@ void btrfs_leak_debug_check(void)  static inline void __btrfs_debug_check_extent_io_range(const char *caller,  		struct extent_io_tree *tree, u64 start, u64 end)  { -	if (tree->ops && tree->ops->check_extent_io_range) -		tree->ops->check_extent_io_range(tree->private_data, caller, -						 start, end); +	struct inode *inode = tree->private_data; +	u64 isize; + +	if (!inode || !is_data_inode(inode)) +		return; + +	isize = i_size_read(inode); +	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { +		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, +		    "%s: ino %llu isize %llu odd range [%llu,%llu]", +			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); +	}  }  #else  #define btrfs_leak_debug_add(new, head)	do {} while (0) @@ -344,13 +353,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,  	return tree_search_for_insert(tree, offset, NULL, NULL);  } -static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, -		     struct extent_state *other) -{ -	if (tree->ops && tree->ops->merge_extent_hook) -		tree->ops->merge_extent_hook(tree->private_data, new, other); -} -  /*   * utility function to look for merge candidates inside a given range.   * Any extents with matching state are merged together into a single @@ -374,7 +376,10 @@ static void merge_state(struct extent_io_tree *tree,  		other = rb_entry(other_node, struct extent_state, rb_node);  		if (other->end == state->start - 1 &&  		    other->state == state->state) { -			merge_cb(tree, state, other); +			if (tree->private_data && +			    is_data_inode(tree->private_data)) +				btrfs_merge_delalloc_extent(tree->private_data, +							    state, other);  			state->start = other->start;  			rb_erase(&other->rb_node, &tree->state);  			RB_CLEAR_NODE(&other->rb_node); @@ -386,7 +391,10 @@ static void merge_state(struct extent_io_tree *tree,  		other = rb_entry(other_node, struct extent_state, rb_node);  		if (other->start == state->end + 1 &&  		    other->state == state->state) { -			merge_cb(tree, state, other); +			if (tree->private_data && +			    is_data_inode(tree->private_data)) +				btrfs_merge_delalloc_extent(tree->private_data, +							    state, other);  			state->end = other->end;  			rb_erase(&other->rb_node, &tree->state);  			RB_CLEAR_NODE(&other->rb_node); @@ -395,20 +403,6 @@ static void merge_state(struct extent_io_tree *tree,  	}  } -static void set_state_cb(struct extent_io_tree *tree, -			 struct extent_state *state, unsigned *bits) -{ -	if (tree->ops && tree->ops->set_bit_hook) -		tree->ops->set_bit_hook(tree->private_data, state, bits); -} - -static void clear_state_cb(struct extent_io_tree *tree, -			   struct extent_state *state, unsigned *bits) -{ -	if (tree->ops && tree->ops->clear_bit_hook) -		tree->ops->clear_bit_hook(tree->private_data, state, bits); -} -  static void set_state_bits(struct extent_io_tree *tree,  			   struct extent_state *state, unsigned *bits,  			   struct extent_changeset *changeset); @@ -451,13 +445,6 @@ static int insert_state(struct extent_io_tree *tree,  	return 0;  } -static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, -		     u64 split) -{ -	if (tree->ops && tree->ops->split_extent_hook) -		tree->ops->split_extent_hook(tree->private_data, orig, split); -} -  /*   * 
split a given extent state struct in two, inserting the preallocated   * struct 'prealloc' as the newly created second half.  'split' indicates an @@ -477,7 +464,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,  {  	struct rb_node *node; -	split_cb(tree, orig, split); +	if (tree->private_data && is_data_inode(tree->private_data)) +		btrfs_split_delalloc_extent(tree->private_data, orig, split);  	prealloc->start = orig->start;  	prealloc->end = split - 1; @@ -504,7 +492,7 @@ static struct extent_state *next_state(struct extent_state *state)  /*   * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1). + * it will optionally wake up anyone waiting on this state (wake == 1).   *   * If no bits are set on the state struct after clearing things, the   * struct is freed and removed from the tree @@ -523,7 +511,10 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,  		WARN_ON(range > tree->dirty_bytes);  		tree->dirty_bytes -= range;  	} -	clear_state_cb(tree, state, bits); + +	if (tree->private_data && is_data_inode(tree->private_data)) +		btrfs_clear_delalloc_extent(tree->private_data, state, bits); +  	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);  	BUG_ON(ret < 0);  	state->state &= ~bits_to_clear; @@ -800,7 +791,9 @@ static void set_state_bits(struct extent_io_tree *tree,  	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;  	int ret; -	set_state_cb(tree, state, bits); +	if (tree->private_data && is_data_inode(tree->private_data)) +		btrfs_set_delalloc_extent(tree->private_data, state, bits); +  	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {  		u64 range = state->end - state->start + 1;  		tree->dirty_bytes += range; @@ -1459,16 +1452,16 @@ out:   * find a contiguous range of bytes in the file marked as delalloc, not   * more than 'max_bytes'.  start and end are used to return the range,   * - * 1 is returned if we find something, 0 if nothing was in the tree + * true is returned if we find something, false if nothing was in the tree   */ -static noinline u64 find_delalloc_range(struct extent_io_tree *tree, +static noinline bool find_delalloc_range(struct extent_io_tree *tree,  					u64 *start, u64 *end, u64 max_bytes,  					struct extent_state **cached_state)  {  	struct rb_node *node;  	struct extent_state *state;  	u64 cur_start = *start; -	u64 found = 0; +	bool found = false;  	u64 total_bytes = 0;  	spin_lock(&tree->lock); @@ -1479,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,  	 */  	node = tree_search(tree, cur_start);  	if (!node) { -		if (!found) -			*end = (u64)-1; +		*end = (u64)-1;  		goto out;  	} @@ -1500,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,  			*cached_state = state;  			refcount_inc(&state->refs);  		} -		found++; +		found = true;  		*end = state->end;  		cur_start = state->end + 1;  		node = rb_next(node); @@ -1558,19 +1550,22 @@ static noinline int lock_delalloc_pages(struct inode *inode,  }  /* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'.  start and end are used to return the range, + * Find and lock a contiguous range of bytes in the file marked as delalloc, no + * more than @max_bytes.  
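The hunks above and below remove the optional merge/split/set/clear extent hooks from extent_io_ops and call the btrfs functions directly, guarded by tree->private_data plus is_data_inode(). A compact sketch of that devirtualization pattern follows; the toy_* types and objectid constant are simplified stand-ins, and merge_delalloc_extent() only prints instead of adjusting accounting.

#include <stdbool.h>
#include <stdio.h>

#define TOY_BTREE_INODE_OBJECTID 1ULL

struct toy_inode { unsigned long long ino; };
struct toy_extent_state { unsigned long long start, end; };

static bool is_data_inode(const struct toy_inode *inode)
{
	return inode->ino != TOY_BTREE_INODE_OBJECTID;
}

/* Direct callee standing in for the old merge_extent_hook callback. */
static void merge_delalloc_extent(struct toy_inode *inode,
				  struct toy_extent_state *new,
				  struct toy_extent_state *other)
{
	printf("ino %llu: merging [%llu,%llu] into [%llu,%llu]\n", inode->ino,
	       other->start, other->end, new->start, new->end);
}

static void merge_state(struct toy_inode *private_data,
			struct toy_extent_state *state,
			struct toy_extent_state *other)
{
	/* The type check that replaced the "is the hook set?" NULL test. */
	if (private_data && is_data_inode(private_data))
		merge_delalloc_extent(private_data, state, other);
	state->start = other->start;
}

int main(void)
{
	struct toy_inode ino = { 257 };
	struct toy_extent_state a = { 4096, 8191 }, b = { 0, 4095 };

	merge_state(&ino, &a, &b);
	printf("merged range now starts at %llu\n", a.start);
	return 0;
}

The practical upside of the direct call is that the compiler can see the callee, and the tree no longer carries one function pointer per optional hook.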
@Start and @end are used to return the range,   * - * 1 is returned if we find something, 0 if nothing was in the tree + * Return: true if we find something + *         false if nothing was in the tree   */ -static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, +EXPORT_FOR_TESTS +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,  				    struct extent_io_tree *tree,  				    struct page *locked_page, u64 *start, -				    u64 *end, u64 max_bytes) +				    u64 *end)  { +	u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;  	u64 delalloc_start;  	u64 delalloc_end; -	u64 found; +	bool found;  	struct extent_state *cached_state = NULL;  	int ret;  	int loops = 0; @@ -1585,7 +1580,7 @@ again:  		*start = delalloc_start;  		*end = delalloc_end;  		free_extent_state(cached_state); -		return 0; +		return false;  	}  	/* @@ -1605,6 +1600,7 @@ again:  	/* step two, lock all the pages after the page that has start */  	ret = lock_delalloc_pages(inode, locked_page,  				  delalloc_start, delalloc_end); +	ASSERT(!ret || ret == -EAGAIN);  	if (ret == -EAGAIN) {  		/* some of the pages are gone, lets avoid looping by  		 * shortening the size of the delalloc range we're searching @@ -1616,11 +1612,10 @@ again:  			loops = 1;  			goto again;  		} else { -			found = 0; +			found = false;  			goto out_failed;  		}  	} -	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */  	/* step three, lock the state bits for the whole range */  	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); @@ -1643,17 +1638,6 @@ out_failed:  	return found;  } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, -				    struct extent_io_tree *tree, -				    struct page *locked_page, u64 *start, -				    u64 *end, u64 max_bytes) -{ -	return find_lock_delalloc_range(inode, tree, locked_page, start, end, -			max_bytes); -} -#endif -  static int __process_pages_contig(struct address_space *mapping,  				  struct page *locked_page,  				  pgoff_t start_index, pgoff_t end_index, @@ -2349,13 +2333,11 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,  }  /* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed + * This is a generic handler for readpage errors. If other copies exist, read + * those and write back good data to the failed position. 
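find_lock_delalloc_range() above, and several free-space-tree helpers further down, drop their test wrappers and are instead marked EXPORT_FOR_TESTS so the self-tests can call an otherwise-static symbol. A self-contained sketch of how such a marker can be defined; the conditional below mirrors the btrfs config option by name but is a simplified stand-in, not the kernel header.

#include <stdio.h>

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
#define EXPORT_FOR_TESTS		/* tests built: give the symbol linkage */
#else
#define EXPORT_FOR_TESTS static		/* normal build: keep it file-local */
#endif

EXPORT_FOR_TESTS int helper_under_test(int x)
{
	return x * 2;
}

int main(void)
{
	/* Inside this translation unit the call compiles either way. */
	printf("%d\n", helper_under_test(21));
	return 0;
}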
Does not investigate + * in remapping the failed extent elsewhere, hoping the device will be smart + * enough to do this as needed   */ -  static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,  			      struct page *page, u64 start, u64 end,  			      int failed_mirror) @@ -2412,14 +2394,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,  void end_extent_writepage(struct page *page, int err, u64 start, u64 end)  {  	int uptodate = (err == 0); -	struct extent_io_tree *tree;  	int ret = 0; -	tree = &BTRFS_I(page->mapping->host)->io_tree; - -	if (tree->ops && tree->ops->writepage_end_io_hook) -		tree->ops->writepage_end_io_hook(page, start, end, NULL, -				uptodate); +	btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);  	if (!uptodate) {  		ClearPageUptodate(page); @@ -2522,6 +2499,8 @@ static void end_bio_extent_readpage(struct bio *bio)  		struct page *page = bvec->bv_page;  		struct inode *inode = page->mapping->host;  		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +		bool data_inode = btrfs_ino(BTRFS_I(inode)) +			!= BTRFS_BTREE_INODE_OBJECTID;  		btrfs_debug(fs_info,  			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2551,7 +2530,7 @@ static void end_bio_extent_readpage(struct bio *bio)  		len = bvec->bv_len;  		mirror = io_bio->mirror_num; -		if (likely(uptodate && tree->ops)) { +		if (likely(uptodate)) {  			ret = tree->ops->readpage_end_io_hook(io_bio, offset,  							      page, start, end,  							      mirror); @@ -2567,38 +2546,37 @@ static void end_bio_extent_readpage(struct bio *bio)  		if (likely(uptodate))  			goto readpage_ok; -		if (tree->ops) { -			ret = tree->ops->readpage_io_failed_hook(page, mirror); -			if (ret == -EAGAIN) { -				/* -				 * Data inode's readpage_io_failed_hook() always -				 * returns -EAGAIN. -				 * -				 * The generic bio_readpage_error handles errors -				 * the following way: If possible, new read -				 * requests are created and submitted and will -				 * end up in end_bio_extent_readpage as well (if -				 * we're lucky, not in the !uptodate case). In -				 * that case it returns 0 and we just go on with -				 * the next page in our bio. If it can't handle -				 * the error it will return -EIO and we remain -				 * responsible for that page. -				 */ -				ret = bio_readpage_error(bio, offset, page, -							 start, end, mirror); -				if (ret == 0) { -					uptodate = !bio->bi_status; -					offset += len; -					continue; -				} -			} +		if (data_inode) {  			/* -			 * metadata's readpage_io_failed_hook() always returns -			 * -EIO and fixes nothing.  -EIO is also returned if -			 * data inode error could not be fixed. +			 * The generic bio_readpage_error handles errors the +			 * following way: If possible, new read requests are +			 * created and submitted and will end up in +			 * end_bio_extent_readpage as well (if we're lucky, +			 * not in the !uptodate case). In that case it returns +			 * 0 and we just go on with the next page in our bio. +			 * If it can't handle the error it will return -EIO and +			 * we remain responsible for that page.  			 
*/ -			ASSERT(ret == -EIO); +			ret = bio_readpage_error(bio, offset, page, start, end, +						 mirror); +			if (ret == 0) { +				uptodate = !bio->bi_status; +				offset += len; +				continue; +			} +		} else { +			struct extent_buffer *eb; + +			eb = (struct extent_buffer *)page->private; +			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); +			eb->read_mirror = mirror; +			atomic_dec(&eb->io_pages); +			if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, +					       &eb->bflags)) +				btree_readahead_hook(eb, -EIO); + +			ret = -EIO;  		}  readpage_ok:  		if (likely(uptodate)) { @@ -2607,7 +2585,7 @@ readpage_ok:  			unsigned off;  			/* Zero out the end if this page straddles i_size */ -			off = i_size & (PAGE_SIZE-1); +			off = offset_in_page(i_size);  			if (page->index == end_index && off)  				zero_user_segment(page, off, PAGE_SIZE);  			SetPageUptodate(page); @@ -2644,8 +2622,7 @@ readpage_ok:  	if (extent_len)  		endio_readpage_release_extent(tree, extent_start, extent_len,  					      uptodate); -	if (io_bio->end_io) -		io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); +	btrfs_io_bio_free_csum(io_bio);  	bio_put(bio);  } @@ -2782,8 +2759,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,  		else  			contig = bio_end_sector(bio) == sector; -		if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size, -						      bio, bio_flags)) +		ASSERT(tree->ops); +		if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))  			can_merge = false;  		if (prev_bio_flags != bio_flags || !contig || !can_merge || @@ -2911,7 +2888,7 @@ static int __do_readpage(struct extent_io_tree *tree,  	if (page->index == last_byte >> PAGE_SHIFT) {  		char *userpage; -		size_t zero_offset = last_byte & (PAGE_SIZE - 1); +		size_t zero_offset = offset_in_page(last_byte);  		if (zero_offset) {  			iosize = PAGE_SIZE - zero_offset; @@ -3205,7 +3182,7 @@ static void update_nr_written(struct writeback_control *wbc,  /*   * helper for __extent_writepage, doing all of the delayed allocation setup.   * - * This returns 1 if our fill_delalloc function did all the work required + * This returns 1 if btrfs_run_delalloc_range function did all the work required   * to write the page (copy into inline extent).  In this case the IO has   * been started and the page is already unlocked.   
* @@ -3213,44 +3190,37 @@ static void update_nr_written(struct writeback_control *wbc,   * This returns < 0 if there were errors (page still locked)   */  static noinline_for_stack int writepage_delalloc(struct inode *inode, -			      struct page *page, struct writeback_control *wbc, -			      struct extent_page_data *epd, -			      u64 delalloc_start, -			      unsigned long *nr_written) +		struct page *page, struct writeback_control *wbc, +		u64 delalloc_start, unsigned long *nr_written)  { -	struct extent_io_tree *tree = epd->tree; +	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;  	u64 page_end = delalloc_start + PAGE_SIZE - 1; -	u64 nr_delalloc; +	bool found;  	u64 delalloc_to_write = 0;  	u64 delalloc_end = 0;  	int ret;  	int page_started = 0; -	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) -		return 0;  	while (delalloc_end < page_end) { -		nr_delalloc = find_lock_delalloc_range(inode, tree, +		found = find_lock_delalloc_range(inode, tree,  					       page,  					       &delalloc_start, -					       &delalloc_end, -					       BTRFS_MAX_EXTENT_SIZE); -		if (nr_delalloc == 0) { +					       &delalloc_end); +		if (!found) {  			delalloc_start = delalloc_end + 1;  			continue;  		} -		ret = tree->ops->fill_delalloc(inode, page, -					       delalloc_start, -					       delalloc_end, -					       &page_started, -					       nr_written, wbc); +		ret = btrfs_run_delalloc_range(inode, page, delalloc_start, +				delalloc_end, &page_started, nr_written, wbc);  		/* File system has been set read-only */  		if (ret) {  			SetPageError(page); -			/* fill_delalloc should be return < 0 for error -			 * but just in case, we use > 0 here meaning the -			 * IO is started, so we don't want to return > 0 -			 * unless things are going well. +			/* +			 * btrfs_run_delalloc_range should return < 0 for error +			 * but just in case, we use > 0 here meaning the IO is +			 * started, so we don't want to return > 0 unless +			 * things are going well.  			 */  			ret = ret < 0 ? 
ret : -EIO;  			goto done; @@ -3323,20 +3293,17 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,  	int nr = 0;  	bool compressed; -	if (tree->ops && tree->ops->writepage_start_hook) { -		ret = tree->ops->writepage_start_hook(page, start, -						      page_end); -		if (ret) { -			/* Fixup worker will requeue */ -			if (ret == -EBUSY) -				wbc->pages_skipped++; -			else -				redirty_page_for_writepage(wbc, page); +	ret = btrfs_writepage_cow_fixup(page, start, page_end); +	if (ret) { +		/* Fixup worker will requeue */ +		if (ret == -EBUSY) +			wbc->pages_skipped++; +		else +			redirty_page_for_writepage(wbc, page); -			update_nr_written(wbc, nr_written); -			unlock_page(page); -			return 1; -		} +		update_nr_written(wbc, nr_written); +		unlock_page(page); +		return 1;  	}  	/* @@ -3347,9 +3314,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,  	end = page_end;  	if (i_size <= start) { -		if (tree->ops && tree->ops->writepage_end_io_hook) -			tree->ops->writepage_end_io_hook(page, start, -							 page_end, NULL, 1); +		btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);  		goto done;  	} @@ -3360,9 +3325,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,  		u64 offset;  		if (cur >= i_size) { -			if (tree->ops && tree->ops->writepage_end_io_hook) -				tree->ops->writepage_end_io_hook(page, cur, -							 page_end, NULL, 1); +			btrfs_writepage_endio_finish_ordered(page, cur, +							     page_end, 1);  			break;  		}  		em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, @@ -3396,11 +3360,10 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,  			 * end_io notification does not happen here for  			 * compressed extents  			 */ -			if (!compressed && tree->ops && -			    tree->ops->writepage_end_io_hook) -				tree->ops->writepage_end_io_hook(page, cur, -							 cur + iosize - 1, -							 NULL, 1); +			if (!compressed) +				btrfs_writepage_endio_finish_ordered(page, cur, +							    cur + iosize - 1, +							    1);  			else if (compressed) {  				/* we don't want to end_page_writeback on  				 * a compressed extent.  this happens @@ -3469,7 +3432,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	ClearPageError(page); -	pg_offset = i_size & (PAGE_SIZE - 1); +	pg_offset = offset_in_page(i_size);  	if (page->index > end_index ||  	   (page->index == end_index && !pg_offset)) {  		page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); @@ -3491,11 +3454,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	set_page_extent_mapped(page); -	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); -	if (ret == 1) -		goto done_unlocked; -	if (ret) -		goto done; +	if (!epd->extent_locked) { +		ret = writepage_delalloc(inode, page, wbc, start, &nr_written); +		if (ret == 1) +			goto done_unlocked; +		if (ret) +			goto done; +	}  	ret = __extent_writepage_io(inode, page, wbc, epd,  				    i_size, nr_written, write_flags, &nr); @@ -3934,12 +3899,25 @@ static int extent_write_cache_pages(struct address_space *mapping,  			range_whole = 1;  		scanned = 1;  	} -	if (wbc->sync_mode == WB_SYNC_ALL) + +	/* +	 * We do the tagged writepage as long as the snapshot flush bit is set +	 * and we are the first one who do the filemap_flush() on this inode. +	 * +	 * The nr_to_write == LONG_MAX is needed to make sure other flushers do +	 * not race in and drop the bit. 
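The snapshot-flush change above relies on test_and_clear_bit(): only the first writeback call that atomically clears BTRFS_INODE_SNAPSHOT_FLUSH switches to tagged writepages, so concurrent flushers cannot both claim it. A user-space sketch of that "first flusher wins" idiom, with atomic_fetch_and standing in for the kernel bit op and a made-up flag value.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_SNAPSHOT_FLUSH (1u << 0)

struct toy_inode { _Atomic unsigned int runtime_flags; };

static bool test_and_clear_flag(struct toy_inode *inode, unsigned int flag)
{
	unsigned int old = atomic_fetch_and(&inode->runtime_flags, ~flag);

	return (old & flag) != 0;	/* true only for the caller that cleared it */
}

static void writeback(struct toy_inode *inode, const char *who)
{
	bool tagged = test_and_clear_flag(inode, TOY_SNAPSHOT_FLUSH);

	printf("%s: %s writepages\n", who, tagged ? "tagged" : "plain");
}

int main(void)
{
	struct toy_inode inode = { TOY_SNAPSHOT_FLUSH };

	writeback(&inode, "snapshot flush");	/* clears the bit, tags pages */
	writeback(&inode, "background flush");	/* bit already gone */
	return 0;
}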
+	 */ +	if (range_whole && wbc->nr_to_write == LONG_MAX && +	    test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, +			       &BTRFS_I(inode)->runtime_flags)) +		wbc->tagged_writepages = 1; + +	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)  		tag = PAGECACHE_TAG_TOWRITE;  	else  		tag = PAGECACHE_TAG_DIRTY;  retry: -	if (wbc->sync_mode == WB_SYNC_ALL) +	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)  		tag_pages_for_writeback(mapping, index, end);  	done_index = index;  	while (!done && !nr_to_write_done && (index <= end) && @@ -4084,10 +4062,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,  		if (clear_page_dirty_for_io(page))  			ret = __extent_writepage(page, &wbc_writepages, &epd);  		else { -			if (tree->ops && tree->ops->writepage_end_io_hook) -				tree->ops->writepage_end_io_hook(page, start, -						 start + PAGE_SIZE - 1, -						 NULL, 1); +			btrfs_writepage_endio_finish_ordered(page, start, +						    start + PAGE_SIZE - 1, 1);  			unlock_page(page);  		}  		put_page(page); @@ -4118,42 +4094,36 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,  		     unsigned nr_pages)  {  	struct bio *bio = NULL; -	unsigned page_idx;  	unsigned long bio_flags = 0;  	struct page *pagepool[16]; -	struct page *page;  	struct extent_map *em_cached = NULL;  	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;  	int nr = 0;  	u64 prev_em_start = (u64)-1; -	for (page_idx = 0; page_idx < nr_pages; page_idx++) { -		page = list_entry(pages->prev, struct page, lru); +	while (!list_empty(pages)) { +		for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { +			struct page *page = list_entry(pages->prev, +						       struct page, lru); -		prefetchw(&page->flags); -		list_del(&page->lru); -		if (add_to_page_cache_lru(page, mapping, -					page->index, -					readahead_gfp_mask(mapping))) { -			put_page(page); -			continue; +			prefetchw(&page->flags); +			list_del(&page->lru); +			if (add_to_page_cache_lru(page, mapping, page->index, +						readahead_gfp_mask(mapping))) { +				put_page(page); +				continue; +			} + +			pagepool[nr++] = page;  		} -		pagepool[nr++] = page; -		if (nr < ARRAY_SIZE(pagepool)) -			continue;  		__extent_readpages(tree, pagepool, nr, &em_cached, &bio, -				&bio_flags, &prev_em_start); -		nr = 0; +				   &bio_flags, &prev_em_start);  	} -	if (nr) -		__extent_readpages(tree, pagepool, nr, &em_cached, &bio, -				&bio_flags, &prev_em_start);  	if (em_cached)  		free_extent_map(em_cached); -	BUG_ON(!list_empty(pages));  	if (bio)  		return submit_one_bio(bio, 0, bio_flags);  	return 0; @@ -4342,7 +4312,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,  	/*  	 * Sanity check, extent_fiemap() should have ensured that new -	 * fiemap extent won't overlap with cahced one. +	 * fiemap extent won't overlap with cached one.  	 * Not recoverable.  	 *  	 * NOTE: Physical address can overlap, due to compression @@ -4914,13 +4884,6 @@ again:  	check_buffer_tree_ref(eb);  	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); -	/* -	 * We will free dummy extent buffer's if they come into -	 * free_extent_buffer with a ref count of 2, but if we are using this we -	 * want the buffers to stay in memory until we're done with them, so -	 * bump the ref count again. 
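A large share of the extent-buffer hunks above and below are mechanical conversions of open-coded masks like `x & (PAGE_SIZE - 1)` to offset_in_page(x), and of open-coded alignment tests to PAGE_ALIGNED(x). The definitions below approximate what those helpers reduce to, assuming a fixed 4 KiB page size for the demo; in the kernel they come from the mm headers and work for any page size.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Roughly what the kernel helpers boil down to. */
#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
#define PAGE_ALIGNED(addr)	(offset_in_page(addr) == 0)

int main(void)
{
	unsigned long pos = 3 * PAGE_SIZE + 123;

	/* Same value as the old open-coded "pos & (PAGE_SIZE - 1)". */
	printf("offset_in_page(%lu) = %lu\n", pos, offset_in_page(pos));
	printf("PAGE_ALIGNED(%lu)   = %d\n", pos, PAGE_ALIGNED(pos));
	printf("PAGE_ALIGNED(%lu)   = %d\n", PAGE_SIZE, PAGE_ALIGNED(PAGE_SIZE));
	return 0;
}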
-	 */ -	atomic_inc(&eb->refs);  	return eb;  free_eb:  	btrfs_release_extent_buffer(eb); @@ -5102,7 +5065,9 @@ void free_extent_buffer(struct extent_buffer *eb)  	while (1) {  		refs = atomic_read(&eb->refs); -		if (refs <= 3) +		if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) +		    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && +			refs == 1))  			break;  		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);  		if (old == refs) @@ -5111,10 +5076,6 @@ void free_extent_buffer(struct extent_buffer *eb)  	spin_lock(&eb->refs_lock);  	if (atomic_read(&eb->refs) == 2 && -	    test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) -		atomic_dec(&eb->refs); - -	if (atomic_read(&eb->refs) == 2 &&  	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&  	    !extent_buffer_under_io(eb) &&  	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -5340,7 +5301,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,  	struct page *page;  	char *kaddr;  	char *dst = (char *)dstv; -	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(eb->start);  	unsigned long i = (start_offset + start) >> PAGE_SHIFT;  	if (start + len > eb->len) { @@ -5350,7 +5311,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,  		return;  	} -	offset = (start_offset + start) & (PAGE_SIZE - 1); +	offset = offset_in_page(start_offset + start);  	while (len > 0) {  		page = eb->pages[i]; @@ -5375,14 +5336,14 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,  	struct page *page;  	char *kaddr;  	char __user *dst = (char __user *)dstv; -	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(eb->start);  	unsigned long i = (start_offset + start) >> PAGE_SHIFT;  	int ret = 0;  	WARN_ON(start > eb->len);  	WARN_ON(start + len > eb->start + eb->len); -	offset = (start_offset + start) & (PAGE_SIZE - 1); +	offset = offset_in_page(start_offset + start);  	while (len > 0) {  		page = eb->pages[i]; @@ -5413,10 +5374,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb,  			      char **map, unsigned long *map_start,  			      unsigned long *map_len)  { -	size_t offset = start & (PAGE_SIZE - 1); +	size_t offset;  	char *kaddr;  	struct page *p; -	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(eb->start);  	unsigned long i = (start_offset + start) >> PAGE_SHIFT;  	unsigned long end_i = (start_offset + start + min_len - 1) >>  		PAGE_SHIFT; @@ -5453,14 +5414,14 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,  	struct page *page;  	char *kaddr;  	char *ptr = (char *)ptrv; -	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(eb->start);  	unsigned long i = (start_offset + start) >> PAGE_SHIFT;  	int ret = 0;  	WARN_ON(start > eb->len);  	WARN_ON(start + len > eb->start + eb->len); -	offset = (start_offset + start) & (PAGE_SIZE - 1); +	offset = offset_in_page(start_offset + start);  	while (len > 0) {  		page = eb->pages[i]; @@ -5509,13 +5470,13 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,  	struct page *page;  	char *kaddr;  	char *src = (char *)srcv; -	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(eb->start);  	unsigned long i = (start_offset + start) >> PAGE_SHIFT;  	WARN_ON(start > eb->len);  	WARN_ON(start + len > eb->start + eb->len); -	offset = (start_offset + start) & (PAGE_SIZE 
- 1); +	offset = offset_in_page(start_offset + start);  	while (len > 0) {  		page = eb->pages[i]; @@ -5539,13 +5500,13 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,  	size_t offset;  	struct page *page;  	char *kaddr; -	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(eb->start);  	unsigned long i = (start_offset + start) >> PAGE_SHIFT;  	WARN_ON(start > eb->len);  	WARN_ON(start + len > eb->start + eb->len); -	offset = (start_offset + start) & (PAGE_SIZE - 1); +	offset = offset_in_page(start_offset + start);  	while (len > 0) {  		page = eb->pages[i]; @@ -5584,13 +5545,12 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,  	size_t offset;  	struct page *page;  	char *kaddr; -	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(dst->start);  	unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;  	WARN_ON(src->len != dst_len); -	offset = (start_offset + dst_offset) & -		(PAGE_SIZE - 1); +	offset = offset_in_page(start_offset + dst_offset);  	while (len > 0) {  		page = dst->pages[i]; @@ -5626,7 +5586,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,  				    unsigned long *page_index,  				    size_t *page_offset)  { -	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(eb->start);  	size_t byte_offset = BIT_BYTE(nr);  	size_t offset; @@ -5638,7 +5598,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,  	offset = start_offset + start + byte_offset;  	*page_index = offset >> PAGE_SHIFT; -	*page_offset = offset & (PAGE_SIZE - 1); +	*page_offset = offset_in_page(offset);  }  /** @@ -5780,7 +5740,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,  	size_t cur;  	size_t dst_off_in_page;  	size_t src_off_in_page; -	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(dst->start);  	unsigned long dst_i;  	unsigned long src_i; @@ -5798,10 +5758,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,  	}  	while (len > 0) { -		dst_off_in_page = (start_offset + dst_offset) & -			(PAGE_SIZE - 1); -		src_off_in_page = (start_offset + src_offset) & -			(PAGE_SIZE - 1); +		dst_off_in_page = offset_in_page(start_offset + dst_offset); +		src_off_in_page = offset_in_page(start_offset + src_offset);  		dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;  		src_i = (start_offset + src_offset) >> PAGE_SHIFT; @@ -5829,7 +5787,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,  	size_t src_off_in_page;  	unsigned long dst_end = dst_offset + len - 1;  	unsigned long src_end = src_offset + len - 1; -	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); +	size_t start_offset = offset_in_page(dst->start);  	unsigned long dst_i;  	unsigned long src_i; @@ -5853,10 +5811,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,  		dst_i = (start_offset + dst_end) >> PAGE_SHIFT;  		src_i = (start_offset + src_end) >> PAGE_SHIFT; -		dst_off_in_page = (start_offset + dst_end) & -			(PAGE_SIZE - 1); -		src_off_in_page = (start_offset + src_end) & -			(PAGE_SIZE - 1); +		dst_off_in_page = offset_in_page(start_offset + dst_end); +		src_off_in_page = offset_in_page(start_offset + src_end);  		cur = min_t(unsigned long, len, src_off_in_page + 1);  		cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h 
b/fs/btrfs/extent_io.h index 369daa5d4f73..9673be3f3d1f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -37,18 +37,22 @@  #define EXTENT_BIO_COMPRESSED 1  #define EXTENT_BIO_FLAG_SHIFT 16 -/* these are bit numbers for test/set bit */ -#define EXTENT_BUFFER_UPTODATE 0 -#define EXTENT_BUFFER_DIRTY 2 -#define EXTENT_BUFFER_CORRUPT 3 -#define EXTENT_BUFFER_READAHEAD 4	/* this got triggered by readahead */ -#define EXTENT_BUFFER_TREE_REF 5 -#define EXTENT_BUFFER_STALE 6 -#define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_READ_ERR 8        /* read IO error */ -#define EXTENT_BUFFER_UNMAPPED 9 -#define EXTENT_BUFFER_IN_TREE 10 -#define EXTENT_BUFFER_WRITE_ERR 11    /* write IO error */ +enum { +	EXTENT_BUFFER_UPTODATE, +	EXTENT_BUFFER_DIRTY, +	EXTENT_BUFFER_CORRUPT, +	/* this got triggered by readahead */ +	EXTENT_BUFFER_READAHEAD, +	EXTENT_BUFFER_TREE_REF, +	EXTENT_BUFFER_STALE, +	EXTENT_BUFFER_WRITEBACK, +	/* read IO error */ +	EXTENT_BUFFER_READ_ERR, +	EXTENT_BUFFER_UNMAPPED, +	EXTENT_BUFFER_IN_TREE, +	/* write IO error */ +	EXTENT_BUFFER_WRITE_ERR, +};  /* these are flags for __process_pages_contig */  #define PAGE_UNLOCK		(1 << 0) @@ -94,38 +98,13 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,  struct extent_io_ops {  	/* -	 * The following callbacks must be allways defined, the function +	 * The following callbacks must be always defined, the function  	 * pointer will be called unconditionally.  	 */  	extent_submit_bio_hook_t *submit_bio_hook;  	int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,  				    struct page *page, u64 start, u64 end,  				    int mirror); -	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - -	/* -	 * Optional hooks, called if the pointer is not NULL -	 */ -	int (*fill_delalloc)(void *private_data, struct page *locked_page, -			     u64 start, u64 end, int *page_started, -			     unsigned long *nr_written, -			     struct writeback_control *wbc); - -	int (*writepage_start_hook)(struct page *page, u64 start, u64 end); -	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, -				      struct extent_state *state, int uptodate); -	void (*set_bit_hook)(void *private_data, struct extent_state *state, -			     unsigned *bits); -	void (*clear_bit_hook)(void *private_data, -			struct extent_state *state, -			unsigned *bits); -	void (*merge_extent_hook)(void *private_data, -				  struct extent_state *new, -				  struct extent_state *other); -	void (*split_extent_hook)(void *private_data, -				  struct extent_state *orig, u64 split); -	void (*check_extent_io_range)(void *private_data, const char *caller, -				      u64 start, u64 end);  };  struct extent_io_tree { @@ -353,11 +332,11 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,  }  static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, -		u64 end) +				     u64 end, struct extent_state **cached)  {  	return clear_extent_bit(tree, start, end,  				EXTENT_DIRTY | EXTENT_DELALLOC | -				EXTENT_DO_ACCOUNTING, 0, 0, NULL); +				EXTENT_DO_ACCOUNTING, 0, 0, cached);  }  int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -546,10 +525,9 @@ int free_io_failure(struct extent_io_tree *failure_tree,  		    struct extent_io_tree *io_tree,  		    struct io_failure_record *rec);  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, -				      struct extent_io_tree *tree, -				      struct page *locked_page, u64 
*start, -				      u64 *end, u64 max_bytes); +bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, +			     struct page *locked_page, u64 *start, +			     u64 *end);  #endif  struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,  					       u64 start); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7eea8b6e2cd3..a042a193c120 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -475,7 +475,8 @@ static struct extent_map *prev_extent_map(struct extent_map *em)  	return container_of(prev, struct extent_map, rb_node);  } -/* helper for btfs_get_extent.  Given an existing extent in the tree, +/* + * Helper for btrfs_get_extent.  Given an existing extent in the tree,   * the existing extent is the nearest extent to map_start,   * and an extent that you want to insert, deal with overlap and insert   * the best fitted new extent into the tree. diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 31977ffd6190..ef05a0121652 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -11,13 +11,20 @@  #define EXTENT_MAP_INLINE ((u64)-2)  #define EXTENT_MAP_DELALLOC ((u64)-1) -/* bits for the flags field */ -#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ -#define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ -#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ -#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ -#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ +/* bits for the extent_map::flags field */ +enum { +	/* this entry not yet on disk, don't free it */ +	EXTENT_FLAG_PINNED, +	EXTENT_FLAG_COMPRESSED, +	/* pre-allocated extent */ +	EXTENT_FLAG_PREALLOC, +	/* Logging this extent */ +	EXTENT_FLAG_LOGGING, +	/* Filling in a preallocated extent */ +	EXTENT_FLAG_FILLING, +	/* filesystem extent mapping type */ +	EXTENT_FLAG_FS_MAPPING, +};  struct extent_map {  	struct rb_node rb_node; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ba74827beb32..920bf3b4b0ef 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -142,11 +142,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,  	return ret;  } -static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) -{ -	kfree(bio->csum_allocated); -} -  static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,  				   u64 logical_offset, u32 *dst, int dio)  { @@ -175,14 +170,12 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio  	nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;  	if (!dst) {  		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { -			btrfs_bio->csum_allocated = kmalloc_array(nblocks, -					csum_size, GFP_NOFS); -			if (!btrfs_bio->csum_allocated) { +			btrfs_bio->csum = kmalloc_array(nblocks, csum_size, +							GFP_NOFS); +			if (!btrfs_bio->csum) {  				btrfs_free_path(path);  				return BLK_STS_RESOURCE;  			} -			btrfs_bio->csum = btrfs_bio->csum_allocated; -			btrfs_bio->end_io = btrfs_io_bio_endio_readpage;  		} else {  			btrfs_bio->csum = btrfs_bio->csum_inline;  		} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 58e93bce3036..d38dc8c31533 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -399,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,  	size_t copied = 0;  	size_t total_copied = 0;  	int pg = 0; -	int offset = pos & (PAGE_SIZE - 1); +	int offset = 
offset_in_page(pos);  	while (write_bytes > 0) {  		size_t count = min_t(size_t, @@ -1611,7 +1611,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,  		return -ENOMEM;  	while (iov_iter_count(i) > 0) { -		size_t offset = pos & (PAGE_SIZE - 1); +		size_t offset = offset_in_page(pos);  		size_t sector_offset;  		size_t write_bytes = min(iov_iter_count(i),  					 nrptrs * (size_t)PAGE_SIZE - @@ -2005,7 +2005,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)  	filp->private_data = NULL;  	/* -	 * ordered_data_close is set by settattr when we are about to truncate +	 * ordered_data_close is set by setattr when we are about to truncate  	 * a file from a non-zero size to a zero size.  This tries to  	 * flush down new bytes that may have been written if the  	 * application were using truncate to replace a file in place. @@ -2114,7 +2114,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	/*  	 * We have to do this here to avoid the priority inversion of waiting on -	 * IO of a lower priority task while holding a transaciton open. +	 * IO of a lower priority task while holding a transaction open.  	 */  	ret = btrfs_wait_ordered_range(inode, start, len);  	if (ret) { @@ -2154,7 +2154,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	 * here we could get into a situation where we're waiting on IO to  	 * happen that is blocked on a transaction trying to commit.  With start  	 * we inc the extwriter counter, so we wait for all extwriters to exit -	 * before we start blocking join'ers.  This comment is to keep somebody +	 * before we start blocking joiners.  This comment is to keep somebody  	 * from thinking they are super smart and changing this to  	 * btrfs_join_transaction *cough*Josef*cough*.  	 */ @@ -2186,25 +2186,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	up_write(&BTRFS_I(inode)->dio_sem);  	inode_unlock(inode); -	/* -	 * If any of the ordered extents had an error, just return it to user -	 * space, so that the application knows some writes didn't succeed and -	 * can take proper action (retry for e.g.). Blindly committing the -	 * transaction in this case, would fool userspace that everything was -	 * successful. And we also want to make sure our log doesn't contain -	 * file extent items pointing to extents that weren't fully written to - -	 * just like in the non fast fsync path, where we check for the ordered -	 * operation's error flag before writing to the log tree and return -EIO -	 * if any of them had this flag set (btrfs_wait_ordered_range) - -	 * therefore we need to check for errors in the ordered operations, -	 * which are indicated by ctx.io_err. 
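Alongside these changes, the extent_io.h and extent_map.h hunks above turn the EXTENT_BUFFER_* and EXTENT_FLAG_* bit-number #defines into anonymous enums: the first enumerator defaults to 0 and the rest auto-increment, which also quietly closes the gaps the old lists had (EXTENT_BUFFER_DIRTY moves from 2 to 1, for instance), harmless for purely in-memory bits. A small sketch of the pattern with abbreviated, invented names.

#include <stdio.h>

/* Bit numbers as an enum instead of "#define FOO 0", "#define BAR 2", ... */
enum {
	TOY_BUFFER_UPTODATE,	/* defaults to 0 */
	TOY_BUFFER_DIRTY,	/* auto-increments to 1 */
	TOY_BUFFER_CORRUPT,
	TOY_BUFFER_READAHEAD,
};

#define SET_FLAG(flags, bit)	((flags) |= 1UL << (bit))
#define TEST_FLAG(flags, bit)	(!!((flags) & (1UL << (bit))))

int main(void)
{
	unsigned long bflags = 0;

	SET_FLAG(bflags, TOY_BUFFER_DIRTY);
	printf("dirty=%d uptodate=%d (dirty is bit %d)\n",
	       TEST_FLAG(bflags, TOY_BUFFER_DIRTY),
	       TEST_FLAG(bflags, TOY_BUFFER_UPTODATE), TOY_BUFFER_DIRTY);
	return 0;
}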
-	 */ -	if (ctx.io_err) { -		btrfs_end_transaction(trans); -		ret = ctx.io_err; -		goto out; -	} -  	if (ret != BTRFS_NO_LOG_SYNC) {  		if (!ret) {  			ret = btrfs_sync_log(trans, root, &ctx); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index d6736595ec57..e5089087eaa6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -74,11 +74,11 @@ out:  	return ret;  } -struct btrfs_free_space_info * -search_free_space_info(struct btrfs_trans_handle *trans, -		       struct btrfs_fs_info *fs_info, -		       struct btrfs_block_group_cache *block_group, -		       struct btrfs_path *path, int cow) +EXPORT_FOR_TESTS +struct btrfs_free_space_info *search_free_space_info( +		struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, +		struct btrfs_block_group_cache *block_group, +		struct btrfs_path *path, int cow)  {  	struct btrfs_root *root = fs_info->free_space_root;  	struct btrfs_key key; @@ -176,6 +176,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)  	}  } +EXPORT_FOR_TESTS  int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,  				  struct btrfs_block_group_cache *block_group,  				  struct btrfs_path *path) @@ -315,6 +316,7 @@ out:  	return ret;  } +EXPORT_FOR_TESTS  int convert_free_space_to_extents(struct btrfs_trans_handle *trans,  				  struct btrfs_block_group_cache *block_group,  				  struct btrfs_path *path) @@ -487,6 +489,7 @@ out:  	return ret;  } +EXPORT_FOR_TESTS  int free_space_test_bit(struct btrfs_block_group_cache *block_group,  			struct btrfs_path *path, u64 offset)  { @@ -775,6 +778,7 @@ out:  	return ret;  } +EXPORT_FOR_TESTS  int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,  				  struct btrfs_block_group_cache *block_group,  				  struct btrfs_path *path, u64 start, u64 size) @@ -968,6 +972,7 @@ out:  	return ret;  } +EXPORT_FOR_TESTS  int __add_to_free_space_tree(struct btrfs_trans_handle *trans,  			     struct btrfs_block_group_cache *block_group,  			     struct btrfs_path *path, u64 start, u64 size) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9ea4c6f0352f..43eb4535319d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -27,6 +27,7 @@  #include <linux/uio.h>  #include <linux/magic.h>  #include <linux/iversion.h> +#include <linux/swap.h>  #include <asm/unaligned.h>  #include "ctree.h"  #include "disk-io.h" @@ -103,23 +104,23 @@ static void __endio_write_update_ordered(struct inode *inode,  /*   * Cleanup all submitted ordered extents in specified range to handle errors - * from the fill_dellaloc() callback. + * from the btrfs_run_delalloc_range() callback.   *   * NOTE: caller must ensure that when an error happens, it can not call   * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING   * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata   * to be released, which we want to happen only when finishing the ordered - * extent (btrfs_finish_ordered_io()). Also note that the caller of the - * fill_delalloc() callback already does proper cleanup for the first page of - * the range, that is, it invokes the callback writepage_end_io_hook() for the - * range of the first page. + * extent (btrfs_finish_ordered_io()).   
*/  static inline void btrfs_cleanup_ordered_extents(struct inode *inode, -						 const u64 offset, -						 const u64 bytes) +						 struct page *locked_page, +						 u64 offset, u64 bytes)  {  	unsigned long index = offset >> PAGE_SHIFT;  	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; +	u64 page_start = page_offset(locked_page); +	u64 page_end = page_start + PAGE_SIZE - 1; +  	struct page *page;  	while (index <= end_index) { @@ -130,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,  		ClearPagePrivate2(page);  		put_page(page);  	} -	return __endio_write_update_ordered(inode, offset + PAGE_SIZE, -					    bytes - PAGE_SIZE, false); + +	/* +	 * In case this page belongs to the delalloc range being instantiated +	 * then skip it, since the first page of a range is going to be +	 * properly cleaned up by the caller of run_delalloc_range +	 */ +	if (page_start >= offset && page_end <= (offset + bytes - 1)) { +		offset += PAGE_SIZE; +		bytes -= PAGE_SIZE; +	} + +	return __endio_write_update_ordered(inode, offset, bytes, false);  }  static int btrfs_dirty_inode(struct inode *inode); @@ -229,7 +240,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,  				     start >> PAGE_SHIFT);  		btrfs_set_file_extent_compression(leaf, ei, 0);  		kaddr = kmap_atomic(page); -		offset = start & (PAGE_SIZE - 1); +		offset = offset_in_page(start);  		write_extent_buffer(leaf, kaddr + offset, ptr, size);  		kunmap_atomic(kaddr);  		put_page(page); @@ -357,7 +368,7 @@ struct async_extent {  struct async_cow {  	struct inode *inode; -	struct btrfs_root *root; +	struct btrfs_fs_info *fs_info;  	struct page *locked_page;  	u64 start;  	u64 end; @@ -538,8 +549,7 @@ again:  					   &total_compressed);  		if (!ret) { -			unsigned long offset = total_compressed & -				(PAGE_SIZE - 1); +			unsigned long offset = offset_in_page(total_compressed);  			struct page *page = pages[nr_pages - 1];  			char *kaddr; @@ -847,14 +857,13 @@ retry:  				    ins.offset, async_extent->pages,  				    async_extent->nr_pages,  				    async_cow->write_flags)) { -			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;  			struct page *p = async_extent->pages[0];  			const u64 start = async_extent->start;  			const u64 end = start + async_extent->ram_size - 1;  			p->mapping = inode->i_mapping; -			tree->ops->writepage_end_io_hook(p, start, end, -							 NULL, 0); +			btrfs_writepage_endio_finish_ordered(p, start, end, 0); +  			p->mapping = NULL;  			extent_clear_unlock_delalloc(inode, start, end, end,  						     NULL, 0, @@ -1144,13 +1153,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)  {  	struct btrfs_fs_info *fs_info;  	struct async_cow *async_cow; -	struct btrfs_root *root;  	unsigned long nr_pages;  	async_cow = container_of(work, struct async_cow, work); -	root = async_cow->root; -	fs_info = root->fs_info; +	fs_info = async_cow->fs_info;  	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>  		PAGE_SHIFT; @@ -1179,7 +1186,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,  {  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  	struct async_cow *async_cow; -	struct btrfs_root *root = BTRFS_I(inode)->root;  	unsigned long nr_pages;  	u64 cur_end; @@ -1189,7 +1195,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,  		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);  		BUG_ON(!async_cow); /* -ENOMEM */  		async_cow->inode = igrab(inode); -		async_cow->root = 
root; +		async_cow->fs_info = fs_info;  		async_cow->locked_page = locked_page;  		async_cow->start = start;  		async_cow->write_flags = write_flags; @@ -1372,7 +1378,8 @@ next_slot:  			 * Do the same check as in btrfs_cross_ref_exist but  			 * without the unnecessary search.  			 */ -			if (btrfs_file_extent_generation(leaf, fi) <= +			if (!nolock && +			    btrfs_file_extent_generation(leaf, fi) <=  			    btrfs_root_last_snapshot(&root->root_item))  				goto out_check;  			if (extent_type == BTRFS_FILE_EXTENT_REG && !force) @@ -1576,12 +1583,12 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)  }  /* - * extent_io.c call back to do delayed allocation processing + * Function to process delayed allocation (create CoW) for ranges which are + * being touched for the first time.   */ -static int run_delalloc_range(void *private_data, struct page *locked_page, -			      u64 start, u64 end, int *page_started, -			      unsigned long *nr_written, -			      struct writeback_control *wbc) +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, +		u64 start, u64 end, int *page_started, unsigned long *nr_written, +		struct writeback_control *wbc)  {  	struct inode *inode = private_data;  	int ret; @@ -1605,14 +1612,14 @@ static int run_delalloc_range(void *private_data, struct page *locked_page,  					   write_flags);  	}  	if (ret) -		btrfs_cleanup_ordered_extents(inode, start, end - start + 1); +		btrfs_cleanup_ordered_extents(inode, locked_page, start, +					      end - start + 1);  	return ret;  } -static void btrfs_split_extent_hook(void *private_data, -				    struct extent_state *orig, u64 split) +void btrfs_split_delalloc_extent(struct inode *inode, +				 struct extent_state *orig, u64 split)  { -	struct inode *inode = private_data;  	u64 size;  	/* not delalloc, ignore it */ @@ -1625,7 +1632,7 @@ static void btrfs_split_extent_hook(void *private_data,  		u64 new_size;  		/* -		 * See the explanation in btrfs_merge_extent_hook, the same +		 * See the explanation in btrfs_merge_delalloc_extent, the same  		 * applies here, just in reverse.  		 */  		new_size = orig->end - split + 1; @@ -1642,16 +1649,13 @@ static void btrfs_split_extent_hook(void *private_data,  }  /* - * extent_io.c merge_extent_hook, used to track merged delayed allocation - * extents so we can keep track of new extents that are just merged onto old - * extents, such as when we are doing sequential writes, so we can properly - * account for the metadata space we'll need. + * Handle merged delayed allocation extents so we can keep track of new extents + * that are just merged onto old extents, such as when we are doing sequential + * writes, so we can properly account for the metadata space we'll need.   */ -static void btrfs_merge_extent_hook(void *private_data, -				    struct extent_state *new, -				    struct extent_state *other) +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +				 struct extent_state *other)  { -	struct inode *inode = private_data;  	u64 new_size, old_size;  	u32 num_extents; @@ -1755,15 +1759,12 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,  }  /* - * extent_io.c set_bit_hook, used to track delayed allocation - * bytes in this file, and to maintain the list of inodes that - * have pending delalloc work to be done. + * Properly track delayed allocation bytes in the inode and to maintain the + * list of inodes that have pending delalloc work to be done.   
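The split and merge callbacks above (btrfs_split_delalloc_extent / btrfs_merge_delalloc_extent) exist to keep the outstanding-extents count honest: a delalloc range is charged one reserved extent per BTRFS_MAX_EXTENT_SIZE worth of bytes, so joining or splitting ranges can change how many extents of metadata stay reserved. A rough round-up sketch of that arithmetic, assuming the 128 MiB maximum used by btrfs; the toy constant and helper mirror count_max_extents() in spirit only.

#include <stdio.h>

#define TOY_MAX_EXTENT_SIZE	(128ULL * 1024 * 1024)	/* 128 MiB */

/* One reserved extent per TOY_MAX_EXTENT_SIZE bytes, rounded up. */
static unsigned int count_max_extents(unsigned long long len)
{
	return (len + TOY_MAX_EXTENT_SIZE - 1) / TOY_MAX_EXTENT_SIZE;
}

int main(void)
{
	unsigned long long a = 4096, b = 4096;	/* two adjacent 4 KiB ranges */
	unsigned int before = count_max_extents(a) + count_max_extents(b);
	unsigned int after = count_max_extents(a + b);

	/* 1 + 1 outstanding extents before the merge, 1 after: one extent's
	 * worth of reserved metadata can be released by the merge hook. */
	printf("before merge: %u extents, after merge: %u\n", before, after);
	return 0;
}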
*/ -static void btrfs_set_bit_hook(void *private_data, -			       struct extent_state *state, unsigned *bits) +void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, +			       unsigned *bits)  { -	struct inode *inode = private_data; -  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1809,14 +1810,14 @@ static void btrfs_set_bit_hook(void *private_data,  }  /* - * extent_io.c clear_bit_hook, see set_bit_hook for why + * Once a range is no longer delalloc this function ensures that proper + * accounting happens.   */ -static void btrfs_clear_bit_hook(void *private_data, -				 struct extent_state *state, -				 unsigned *bits) +void btrfs_clear_delalloc_extent(struct inode *vfs_inode, +				 struct extent_state *state, unsigned *bits)  { -	struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); +	struct btrfs_inode *inode = BTRFS_I(vfs_inode); +	struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);  	u64 len = state->end + 1 - state->start;  	u32 num_extents = count_max_extents(len); @@ -1841,7 +1842,7 @@ static void btrfs_clear_bit_hook(void *private_data,  		/*  		 * We don't reserve metadata space for space cache inodes so we -		 * don't need to call dellalloc_release_metadata if there is an +		 * don't need to call delalloc_release_metadata if there is an  		 * error.  		 */  		if (*bits & EXTENT_CLEAR_META_RESV && @@ -1880,16 +1881,21 @@ static void btrfs_clear_bit_hook(void *private_data,  }  /* - * Merge bio hook, this must check the chunk tree to make sure we don't create - * bios that span stripes or chunks + * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit + * in a chunk's stripe. This function ensures that bios do not span a + * stripe/chunk   * - * return 1 if page cannot be merged to bio - * return 0 if page can be merged to bio + * @page - The page we are about to add to the bio + * @size - size we want to add to the bio + * @bio - bio we want to ensure is smaller than a stripe + * @bio_flags - flags of the bio + * + * return 1 if page cannot be added to the bio + * return 0 if page can be added to the bio   * return error otherwise   */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, -			 size_t size, struct bio *bio, -			 unsigned long bio_flags) +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, +			     unsigned long bio_flags)  {  	struct inode *inode = page->mapping->host;  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1932,29 +1938,6 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,  }  /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time.   All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, -			  int mirror_num) -{ -	struct inode *inode = private_data; -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -	blk_status_t ret; - -	ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); -	if (ret) { -		bio->bi_status = ret; -		bio_endio(bio); -	} -	return ret; -} - -/*   * extent_io.c submission hook. 
This does the right thing for csum calculation   * on write, or reading the csums from the tree before a read.   * @@ -2056,7 +2039,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,  			      unsigned int extra_bits,  			      struct extent_state **cached_state, int dedupe)  { -	WARN_ON((end & (PAGE_SIZE - 1)) == 0); +	WARN_ON(PAGE_ALIGNED(end));  	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,  				   extra_bits, cached_state);  } @@ -2152,7 +2135,7 @@ out_page:   * to fix it up.  The async helper will wait for ordered extents, set   * the delalloc bit and make it safe to write the page.   */ -static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)  {  	struct inode *inode = page->mapping->host;  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3159,8 +3142,8 @@ static void finish_ordered_fn(struct btrfs_work *work)  	btrfs_finish_ordered_io(ordered_extent);  } -static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, -				struct extent_state *state, int uptodate) +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +					  u64 end, int uptodate)  {  	struct inode *inode = page->mapping->host;  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3686,6 +3669,21 @@ cache_index:  	 * inode is not a directory, logging its parent unnecessarily.  	 */  	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; +	/* +	 * Similar reasoning for last_link_trans, needs to be set otherwise +	 * for a case like the following: +	 * +	 * mkdir A +	 * touch foo +	 * ln foo A/bar +	 * echo 2 > /proc/sys/vm/drop_caches +	 * fsync foo +	 * <power failure> +	 * +	 * Would result in link bar and directory A not existing after the power +	 * failure. +	 */ +	BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;  	path->slots[0]++;  	if (inode->i_nlink != 1 || @@ -4444,31 +4442,6 @@ out:  	return err;  } -static int truncate_space_check(struct btrfs_trans_handle *trans, -				struct btrfs_root *root, -				u64 bytes_deleted) -{ -	struct btrfs_fs_info *fs_info = root->fs_info; -	int ret; - -	/* -	 * This is only used to apply pressure to the enospc system, we don't -	 * intend to use this reservation at all. -	 */ -	bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted); -	bytes_deleted *= fs_info->nodesize; -	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, -				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH); -	if (!ret) { -		trace_btrfs_space_reservation(fs_info, "transaction", -					      trans->transid, -					      bytes_deleted, 1); -		trans->bytes_reserved += bytes_deleted; -	} -	return ret; - -} -  /*   * Return this if we need to call truncate_block for the last bit of the   * truncate. @@ -4513,7 +4486,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  	u64 bytes_deleted = 0;  	bool be_nice = false;  	bool should_throttle = false; -	bool should_end = false;  	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4544,7 +4516,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  	/*  	 * This function is also used to drop the items in the log tree before  	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means -	 * it is used to drop the loged items. So we shouldn't kill the delayed +	 * it is used to drop the logged items. So we shouldn't kill the delayed  	 * items.  	 
*/  	if (min_type == 0 && root == BTRFS_I(inode)->root) @@ -4726,15 +4698,7 @@ delete:  				btrfs_abort_transaction(trans, ret);  				break;  			} -			if (btrfs_should_throttle_delayed_refs(trans)) -				btrfs_async_run_delayed_refs(fs_info, -					trans->delayed_ref_updates * 2, -					trans->transid, 0);  			if (be_nice) { -				if (truncate_space_check(trans, root, -							 extent_num_bytes)) { -					should_end = true; -				}  				if (btrfs_should_throttle_delayed_refs(trans))  					should_throttle = true;  			} @@ -4745,7 +4709,7 @@ delete:  		if (path->slots[0] == 0 ||  		    path->slots[0] != pending_del_slot || -		    should_throttle || should_end) { +		    should_throttle) {  			if (pending_del_nr) {  				ret = btrfs_del_items(trans, root, path,  						pending_del_slot, @@ -4757,23 +4721,24 @@ delete:  				pending_del_nr = 0;  			}  			btrfs_release_path(path); -			if (should_throttle) { -				unsigned long updates = trans->delayed_ref_updates; -				if (updates) { -					trans->delayed_ref_updates = 0; -					ret = btrfs_run_delayed_refs(trans, -								   updates * 2); -					if (ret) -						break; -				} -			} +  			/* -			 * if we failed to refill our space rsv, bail out -			 * and let the transaction restart +			 * We can generate a lot of delayed refs, so we need to +			 * throttle every once and a while and make sure we're +			 * adding enough space to keep up with the work we are +			 * generating.  Since we hold a transaction here we +			 * can't flush, and we don't want to FLUSH_LIMIT because +			 * we could have generated too many delayed refs to +			 * actually allocate, so just bail if we're short and +			 * let the normal reservation dance happen higher up.  			 */ -			if (should_end) { -				ret = -EAGAIN; -				break; +			if (should_throttle) { +				ret = btrfs_delayed_refs_rsv_refill(fs_info, +							BTRFS_RESERVE_NO_FLUSH); +				if (ret) { +					ret = -EAGAIN; +					break; +				}  			}  			goto search_again;  		} else { @@ -4799,18 +4764,6 @@ out:  	}  	btrfs_free_path(path); - -	if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { -		unsigned long updates = trans->delayed_ref_updates; -		int err; - -		if (updates) { -			trans->delayed_ref_updates = 0; -			err = btrfs_run_delayed_refs(trans, updates * 2); -			if (err) -				ret = err; -		} -	}  	return ret;  } @@ -5155,7 +5108,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)  		truncate_setsize(inode, newsize); -		/* Disable nonlocked read DIO to avoid the end less truncate */ +		/* Disable nonlocked read DIO to avoid the endless truncate */  		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));  		inode_dio_wait(inode);  		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); @@ -5333,8 +5286,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,  		 * Try to steal from the global reserve if there is space for  		 * it.  		 */ -		if (!btrfs_check_space_for_delayed_refs(trans) && -		    !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) +		if (!btrfs_check_space_for_delayed_refs(fs_info) && +		    !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0))  			return trans;  		/* If not, commit and try again. 
*/ @@ -6406,14 +6359,19 @@ fail_dir_item:  		err = btrfs_del_root_ref(trans, key.objectid,  					 root->root_key.objectid, parent_ino,  					 &local_index, name, name_len); - +		if (err) +			btrfs_abort_transaction(trans, err);  	} else if (add_backref) {  		u64 local_index;  		int err;  		err = btrfs_del_inode_ref(trans, root, name, name_len,  					  ino, parent_ino, &local_index); +		if (err) +			btrfs_abort_transaction(trans, err);  	} + +	/* Return the original error code */  	return ret;  } @@ -6625,6 +6583,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,  			if (err)  				goto fail;  		} +		BTRFS_I(inode)->last_link_trans = trans->transid;  		d_instantiate(dentry, inode);  		ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,  					 true, NULL); @@ -6652,7 +6611,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(dir)->root;  	int err = 0; -	int drop_on_err = 0;  	u64 objectid = 0;  	u64 index = 0; @@ -6678,7 +6636,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  		goto out_fail;  	} -	drop_on_err = 1;  	/* these must be set before we unlock the inode */  	inode->i_op = &btrfs_dir_inode_operations;  	inode->i_fop = &btrfs_dir_file_operations; @@ -6699,7 +6656,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  		goto out_fail;  	d_instantiate_new(dentry, inode); -	drop_on_err = 0;  out_fail:  	btrfs_end_transaction(trans); @@ -8053,9 +8009,7 @@ static void btrfs_endio_direct_read(struct bio *bio)  	dio_bio->bi_status = err;  	dio_end_io(dio_bio); - -	if (io_bio->end_io) -		io_bio->end_io(io_bio, blk_status_to_errno(err)); +	btrfs_io_bio_free_csum(io_bio);  	bio_put(bio);  } @@ -8098,7 +8052,7 @@ static void __endio_write_update_ordered(struct inode *inode,  			return;  		/*  		 * Our bio might span multiple ordered extents. In this case -		 * we keep goin until we have accounted the whole dio. +		 * we keep going until we have accounted the whole dio.  		 */  		if (ordered_offset < offset + bytes) {  			ordered_bytes = offset + bytes - ordered_offset; @@ -8408,8 +8362,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,  	if (!ret)  		return; -	if (io_bio->end_io) -		io_bio->end_io(io_bio, ret); +	btrfs_io_bio_free_csum(io_bio);  free_ordered:  	/* @@ -8912,7 +8865,7 @@ again:  	/* page is wholly or partially inside EOF */  	if (page_start + PAGE_SIZE > size) -		zero_start = size & ~PAGE_MASK; +		zero_start = offset_in_page(size);  	else  		zero_start = PAGE_SIZE; @@ -9157,6 +9110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->index_cnt = (u64)-1;  	ei->dir_index = 0;  	ei->last_unlink_trans = 0; +	ei->last_link_trans = 0;  	ei->last_log_commit = 0;  	spin_lock_init(&ei->lock); @@ -9968,7 +9922,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode   * some fairly slow code that needs optimization. This walks the list   * of all the inodes with pending delalloc and forces them to disk.   
*/ -static int start_delalloc_inodes(struct btrfs_root *root, int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)  {  	struct btrfs_inode *binode;  	struct inode *inode; @@ -9996,6 +9950,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)  		}  		spin_unlock(&root->delalloc_lock); +		if (snapshot) +			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, +				&binode->runtime_flags);  		work = btrfs_alloc_delalloc_work(inode);  		if (!work) {  			iput(inode); @@ -10029,7 +9986,7 @@ out:  	return ret;  } -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root)  {  	struct btrfs_fs_info *fs_info = root->fs_info;  	int ret; @@ -10037,7 +9994,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)  	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))  		return -EROFS; -	ret = start_delalloc_inodes(root, -1); +	ret = start_delalloc_inodes(root, -1, true);  	if (ret > 0)  		ret = 0;  	return ret; @@ -10066,7 +10023,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)  			       &fs_info->delalloc_roots);  		spin_unlock(&fs_info->delalloc_root_lock); -		ret = start_delalloc_inodes(root, nr); +		ret = start_delalloc_inodes(root, nr, false);  		btrfs_put_fs_root(root);  		if (ret < 0)  			goto out; @@ -10445,26 +10402,6 @@ out:  	return ret;  } -__attribute__((const)) -static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) -{ -	return -EAGAIN; -} - -static void btrfs_check_extent_io_range(void *private_data, const char *caller, -					u64 start, u64 end) -{ -	struct inode *inode = private_data; -	u64 isize; - -	isize = i_size_read(inode); -	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { -		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, -		    "%s: ino %llu isize %llu odd range [%llu,%llu]", -			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); -	} -} -  void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)  {  	struct inode *inode = tree->private_data; @@ -10481,6 +10418,343 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)  	}  } +#ifdef CONFIG_SWAP +/* + * Add an entry indicating a block group or device which is pinned by a + * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a + * negative errno on failure. + */ +static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, +				  bool is_block_group) +{ +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; +	struct btrfs_swapfile_pin *sp, *entry; +	struct rb_node **p; +	struct rb_node *parent = NULL; + +	sp = kmalloc(sizeof(*sp), GFP_NOFS); +	if (!sp) +		return -ENOMEM; +	sp->ptr = ptr; +	sp->inode = inode; +	sp->is_block_group = is_block_group; + +	spin_lock(&fs_info->swapfile_pins_lock); +	p = &fs_info->swapfile_pins.rb_node; +	while (*p) { +		parent = *p; +		entry = rb_entry(parent, struct btrfs_swapfile_pin, node); +		if (sp->ptr < entry->ptr || +		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) { +			p = &(*p)->rb_left; +		} else if (sp->ptr > entry->ptr || +			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) { +			p = &(*p)->rb_right; +		} else { +			spin_unlock(&fs_info->swapfile_pins_lock); +			kfree(sp); +			return 1; +		} +	} +	rb_link_node(&sp->node, parent, p); +	rb_insert_color(&sp->node, &fs_info->swapfile_pins); +	spin_unlock(&fs_info->swapfile_pins_lock); +	return 0; +} + +/* Free all of the entries pinned by this swapfile. 
*/ +static void btrfs_free_swapfile_pins(struct inode *inode) +{ +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; +	struct btrfs_swapfile_pin *sp; +	struct rb_node *node, *next; + +	spin_lock(&fs_info->swapfile_pins_lock); +	node = rb_first(&fs_info->swapfile_pins); +	while (node) { +		next = rb_next(node); +		sp = rb_entry(node, struct btrfs_swapfile_pin, node); +		if (sp->inode == inode) { +			rb_erase(&sp->node, &fs_info->swapfile_pins); +			if (sp->is_block_group) +				btrfs_put_block_group(sp->ptr); +			kfree(sp); +		} +		node = next; +	} +	spin_unlock(&fs_info->swapfile_pins_lock); +} + +struct btrfs_swap_info { +	u64 start; +	u64 block_start; +	u64 block_len; +	u64 lowest_ppage; +	u64 highest_ppage; +	unsigned long nr_pages; +	int nr_extents; +}; + +static int btrfs_add_swap_extent(struct swap_info_struct *sis, +				 struct btrfs_swap_info *bsi) +{ +	unsigned long nr_pages; +	u64 first_ppage, first_ppage_reported, next_ppage; +	int ret; + +	first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; +	next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, +				PAGE_SIZE) >> PAGE_SHIFT; + +	if (first_ppage >= next_ppage) +		return 0; +	nr_pages = next_ppage - first_ppage; + +	first_ppage_reported = first_ppage; +	if (bsi->start == 0) +		first_ppage_reported++; +	if (bsi->lowest_ppage > first_ppage_reported) +		bsi->lowest_ppage = first_ppage_reported; +	if (bsi->highest_ppage < (next_ppage - 1)) +		bsi->highest_ppage = next_ppage - 1; + +	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); +	if (ret < 0) +		return ret; +	bsi->nr_extents += ret; +	bsi->nr_pages += nr_pages; +	return 0; +} + +static void btrfs_swap_deactivate(struct file *file) +{ +	struct inode *inode = file_inode(file); + +	btrfs_free_swapfile_pins(inode); +	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, +			       sector_t *span) +{ +	struct inode *inode = file_inode(file); +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +	struct extent_state *cached_state = NULL; +	struct extent_map *em = NULL; +	struct btrfs_device *device = NULL; +	struct btrfs_swap_info bsi = { +		.lowest_ppage = (sector_t)-1ULL, +	}; +	int ret = 0; +	u64 isize; +	u64 start; + +	/* +	 * If the swap file was just created, make sure delalloc is done. If the +	 * file changes again after this, the user is doing something stupid and +	 * we don't really care. +	 */ +	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); +	if (ret) +		return ret; + +	/* +	 * The inode is locked, so these flags won't change after we check them. +	 */ +	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { +		btrfs_warn(fs_info, "swapfile must not be compressed"); +		return -EINVAL; +	} +	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { +		btrfs_warn(fs_info, "swapfile must not be copy-on-write"); +		return -EINVAL; +	} +	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { +		btrfs_warn(fs_info, "swapfile must not be checksummed"); +		return -EINVAL; +	} + +	/* +	 * Balance or device remove/replace/resize can move stuff around from +	 * under us. The EXCL_OP flag makes sure they aren't running/won't run +	 * concurrently while we are mapping the swap extents, and +	 * fs_info->swapfile_pins prevents them from running while the swap file +	 * is active and moving the extents. 
Note that this also prevents a +	 * concurrent device add which isn't actually necessary, but it's not +	 * really worth the trouble to allow it. +	 */ +	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { +		btrfs_warn(fs_info, +	   "cannot activate swapfile while exclusive operation is running"); +		return -EBUSY; +	} +	/* +	 * Snapshots can create extents which require COW even if NODATACOW is +	 * set. We use this counter to prevent snapshots. We must increment it +	 * before walking the extents because we don't want a concurrent +	 * snapshot to run after we've already checked the extents. +	 */ +	atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles); + +	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); + +	lock_extent_bits(io_tree, 0, isize - 1, &cached_state); +	start = 0; +	while (start < isize) { +		u64 logical_block_start, physical_block_start; +		struct btrfs_block_group_cache *bg; +		u64 len = isize - start; + +		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); +		if (IS_ERR(em)) { +			ret = PTR_ERR(em); +			goto out; +		} + +		if (em->block_start == EXTENT_MAP_HOLE) { +			btrfs_warn(fs_info, "swapfile must not have holes"); +			ret = -EINVAL; +			goto out; +		} +		if (em->block_start == EXTENT_MAP_INLINE) { +			/* +			 * It's unlikely we'll ever actually find ourselves +			 * here, as a file small enough to fit inline won't be +			 * big enough to store more than the swap header, but in +			 * case something changes in the future, let's catch it +			 * here rather than later. +			 */ +			btrfs_warn(fs_info, "swapfile must not be inline"); +			ret = -EINVAL; +			goto out; +		} +		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { +			btrfs_warn(fs_info, "swapfile must not be compressed"); +			ret = -EINVAL; +			goto out; +		} + +		logical_block_start = em->block_start + (start - em->start); +		len = min(len, em->len - (start - em->start)); +		free_extent_map(em); +		em = NULL; + +		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL); +		if (ret < 0) { +			goto out; +		} else if (ret) { +			ret = 0; +		} else { +			btrfs_warn(fs_info, +				   "swapfile must not be copy-on-write"); +			ret = -EINVAL; +			goto out; +		} + +		em = btrfs_get_chunk_map(fs_info, logical_block_start, len); +		if (IS_ERR(em)) { +			ret = PTR_ERR(em); +			goto out; +		} + +		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { +			btrfs_warn(fs_info, +				   "swapfile must have single data profile"); +			ret = -EINVAL; +			goto out; +		} + +		if (device == NULL) { +			device = em->map_lookup->stripes[0].dev; +			ret = btrfs_add_swapfile_pin(inode, device, false); +			if (ret == 1) +				ret = 0; +			else if (ret) +				goto out; +		} else if (device != em->map_lookup->stripes[0].dev) { +			btrfs_warn(fs_info, "swapfile must be on one device"); +			ret = -EINVAL; +			goto out; +		} + +		physical_block_start = (em->map_lookup->stripes[0].physical + +					(logical_block_start - em->start)); +		len = min(len, em->len - (logical_block_start - em->start)); +		free_extent_map(em); +		em = NULL; + +		bg = btrfs_lookup_block_group(fs_info, logical_block_start); +		if (!bg) { +			btrfs_warn(fs_info, +			   "could not find block group containing swapfile"); +			ret = -EINVAL; +			goto out; +		} + +		ret = btrfs_add_swapfile_pin(inode, bg, true); +		if (ret) { +			btrfs_put_block_group(bg); +			if (ret == 1) +				ret = 0; +			else +				goto out; +		} + +		if (bsi.block_len && +		    bsi.block_start + bsi.block_len == physical_block_start) { +			bsi.block_len += len; +		} else { 
+			if (bsi.block_len) { +				ret = btrfs_add_swap_extent(sis, &bsi); +				if (ret) +					goto out; +			} +			bsi.start = start; +			bsi.block_start = physical_block_start; +			bsi.block_len = len; +		} + +		start += len; +	} + +	if (bsi.block_len) +		ret = btrfs_add_swap_extent(sis, &bsi); + +out: +	if (!IS_ERR_OR_NULL(em)) +		free_extent_map(em); + +	unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + +	if (ret) +		btrfs_swap_deactivate(file); + +	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + +	if (ret) +		return ret; + +	if (device) +		sis->bdev = device->bdev; +	*span = bsi.highest_ppage - bsi.lowest_ppage + 1; +	sis->max = bsi.nr_pages; +	sis->pages = bsi.nr_pages - 1; +	sis->highest_bit = bsi.nr_pages - 1; +	return bsi.nr_extents; +} +#else +static void btrfs_swap_deactivate(struct file *file) +{ +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, +			       sector_t *span) +{ +	return -EOPNOTSUPP; +} +#endif +  static const struct inode_operations btrfs_dir_inode_operations = {  	.getattr	= btrfs_getattr,  	.lookup		= btrfs_lookup, @@ -10523,17 +10797,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = {  	/* mandatory callbacks */  	.submit_bio_hook = btrfs_submit_bio_hook,  	.readpage_end_io_hook = btrfs_readpage_end_io_hook, -	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook, - -	/* optional callbacks */ -	.fill_delalloc = run_delalloc_range, -	.writepage_end_io_hook = btrfs_writepage_end_io_hook, -	.writepage_start_hook = btrfs_writepage_start_hook, -	.set_bit_hook = btrfs_set_bit_hook, -	.clear_bit_hook = btrfs_clear_bit_hook, -	.merge_extent_hook = btrfs_merge_extent_hook, -	.split_extent_hook = btrfs_split_extent_hook, -	.check_extent_io_range = btrfs_check_extent_io_range,  };  /* @@ -10558,6 +10821,8 @@ static const struct address_space_operations btrfs_aops = {  	.releasepage	= btrfs_releasepage,  	.set_page_dirty	= btrfs_set_page_dirty,  	.error_remove_page = generic_error_remove_page, +	.swap_activate	= btrfs_swap_activate, +	.swap_deactivate = btrfs_swap_deactivate,  };  static const struct inode_operations btrfs_file_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 802a628e9f7d..fab9443f6a42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -290,6 +290,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)  	} else if (fsflags & FS_COMPR_FL) {  		const char *comp; +		if (IS_SWAPFILE(inode)) { +			ret = -ETXTBSY; +			goto out_unlock; +		} +  		binode->flags |= BTRFS_INODE_COMPRESS;  		binode->flags &= ~BTRFS_INODE_NOCOMPRESS; @@ -754,6 +759,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,  	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))  		return -EINVAL; +	if (atomic_read(&root->nr_swapfiles)) { +		btrfs_warn(fs_info, +			   "cannot snapshot subvolume with active swapfile"); +		return -ETXTBSY; +	} +  	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);  	if (!pending_snapshot)  		return -ENOMEM; @@ -777,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,  	wait_event(root->subv_writers->wait,  		   percpu_counter_sum(&root->subv_writers->counter) == 0); -	ret = btrfs_start_delalloc_inodes(root); +	ret = btrfs_start_delalloc_snapshot(root);  	if (ret)  		goto dec_and_free; @@ -1505,9 +1516,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  		}  		inode_lock(inode); -		if (do_compress) -			BTRFS_I(inode)->defrag_compress = compress_type; -		ret = 
cluster_pages_for_defrag(inode, pages, i, cluster); +		if (IS_SWAPFILE(inode)) { +			ret = -ETXTBSY; +		} else { +			if (do_compress) +				BTRFS_I(inode)->defrag_compress = compress_type; +			ret = cluster_pages_for_defrag(inode, pages, i, cluster); +		}  		if (ret < 0) {  			inode_unlock(inode);  			goto out_ra; @@ -3135,7 +3150,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,  	}  	rcu_read_unlock(); -	memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); +	memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));  	fi_args->nodesize = fs_info->nodesize;  	fi_args->sectorsize = fs_info->sectorsize;  	fi_args->clone_alignment = fs_info->sectorsize; @@ -3191,92 +3206,6 @@ out:  	return ret;  } -static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) -{ -	struct page *page; - -	page = grab_cache_page(inode->i_mapping, index); -	if (!page) -		return ERR_PTR(-ENOMEM); - -	if (!PageUptodate(page)) { -		int ret; - -		ret = btrfs_readpage(NULL, page); -		if (ret) -			return ERR_PTR(ret); -		lock_page(page); -		if (!PageUptodate(page)) { -			unlock_page(page); -			put_page(page); -			return ERR_PTR(-EIO); -		} -		if (page->mapping != inode->i_mapping) { -			unlock_page(page); -			put_page(page); -			return ERR_PTR(-EAGAIN); -		} -	} - -	return page; -} - -static int gather_extent_pages(struct inode *inode, struct page **pages, -			       int num_pages, u64 off) -{ -	int i; -	pgoff_t index = off >> PAGE_SHIFT; - -	for (i = 0; i < num_pages; i++) { -again: -		pages[i] = extent_same_get_page(inode, index + i); -		if (IS_ERR(pages[i])) { -			int err = PTR_ERR(pages[i]); - -			if (err == -EAGAIN) -				goto again; -			pages[i] = NULL; -			return err; -		} -	} -	return 0; -} - -static int lock_extent_range(struct inode *inode, u64 off, u64 len, -			     bool retry_range_locking) -{ -	/* -	 * Do any pending delalloc/csum calculations on inode, one way or -	 * another, and lock file content. 
-	 * The locking order is: -	 * -	 *   1) pages -	 *   2) range in the inode's io tree -	 */ -	while (1) { -		struct btrfs_ordered_extent *ordered; -		lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); -		ordered = btrfs_lookup_first_ordered_extent(inode, -							    off + len - 1); -		if ((!ordered || -		     ordered->file_offset + ordered->len <= off || -		     ordered->file_offset >= off + len) && -		    !test_range_bit(&BTRFS_I(inode)->io_tree, off, -				    off + len - 1, EXTENT_DELALLOC, 0, NULL)) { -			if (ordered) -				btrfs_put_ordered_extent(ordered); -			break; -		} -		unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); -		if (ordered) -			btrfs_put_ordered_extent(ordered); -		if (!retry_range_locking) -			return -EAGAIN; -		btrfs_wait_ordered_range(inode, off, len); -	} -	return 0; -} -  static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)  {  	inode_unlock(inode1); @@ -3292,261 +3221,32 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)  	inode_lock_nested(inode2, I_MUTEX_CHILD);  } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, -				      struct inode *inode2, u64 loff2, u64 len) -{ -	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); -	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, -				    struct inode *inode2, u64 loff2, u64 len, -				    bool retry_range_locking) -{ -	int ret; - -	if (inode1 < inode2) { -		swap(inode1, inode2); -		swap(loff1, loff2); -	} -	ret = lock_extent_range(inode1, loff1, len, retry_range_locking); -	if (ret) -		return ret; -	ret = lock_extent_range(inode2, loff2, len, retry_range_locking); -	if (ret) -		unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, -			      loff1 + len - 1); -	return ret; -} - -struct cmp_pages { -	int		num_pages; -	struct page	**src_pages; -	struct page	**dst_pages; -}; - -static void btrfs_cmp_data_free(struct cmp_pages *cmp) -{ -	int i; -	struct page *pg; - -	for (i = 0; i < cmp->num_pages; i++) { -		pg = cmp->src_pages[i]; -		if (pg) { -			unlock_page(pg); -			put_page(pg); -			cmp->src_pages[i] = NULL; -		} -		pg = cmp->dst_pages[i]; -		if (pg) { -			unlock_page(pg); -			put_page(pg); -			cmp->dst_pages[i] = NULL; -		} -	} -} - -static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, -				  struct inode *dst, u64 dst_loff, -				  u64 len, struct cmp_pages *cmp) -{ -	int ret; -	int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; - -	cmp->num_pages = num_pages; - -	ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); -	if (ret) -		goto out; - -	ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); - -out: -	if (ret) -		btrfs_cmp_data_free(cmp); -	return ret; -} - -static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) -{ -	int ret = 0; -	int i; -	struct page *src_page, *dst_page; -	unsigned int cmp_len = PAGE_SIZE; -	void *addr, *dst_addr; - -	i = 0; -	while (len) { -		if (len < PAGE_SIZE) -			cmp_len = len; - -		BUG_ON(i >= cmp->num_pages); - -		src_page = cmp->src_pages[i]; -		dst_page = cmp->dst_pages[i]; -		ASSERT(PageLocked(src_page)); -		ASSERT(PageLocked(dst_page)); - -		addr = kmap_atomic(src_page); -		dst_addr = kmap_atomic(dst_page); - -		flush_dcache_page(src_page); -		flush_dcache_page(dst_page); - -		if (memcmp(addr, dst_addr, cmp_len)) -			ret = -EBADE; - -		kunmap_atomic(addr); -		kunmap_atomic(dst_addr); - -		if (ret) -			break; - -		len -= cmp_len; -		i++; -	} - -	
return ret; -} - -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, -				     u64 olen) -{ -	u64 len = *plen; -	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - -	if (off + olen > inode->i_size || off + olen < off) -		return -EINVAL; - -	/* if we extend to eof, continue to block boundary */ -	if (off + len == inode->i_size) -		*plen = len = ALIGN(inode->i_size, bs) - off; - -	/* Check that we are block aligned - btrfs_clone() requires this */ -	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) -		return -EINVAL; - -	return 0; -} -  static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, -				   struct inode *dst, u64 dst_loff, -				   struct cmp_pages *cmp) +				   struct inode *dst, u64 dst_loff)  { +	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;  	int ret;  	u64 len = olen; -	bool same_inode = (src == dst); -	u64 same_lock_start = 0; -	u64 same_lock_len = 0; - -	ret = extent_same_check_offsets(src, loff, &len, olen); -	if (ret) -		return ret; - -	ret = extent_same_check_offsets(dst, dst_loff, &len, olen); -	if (ret) -		return ret; - -	if (same_inode) { -		/* -		 * Single inode case wants the same checks, except we -		 * don't want our length pushed out past i_size as -		 * comparing that data range makes no sense. -		 * -		 * extent_same_check_offsets() will do this for an -		 * unaligned length at i_size, so catch it here and -		 * reject the request. -		 * -		 * This effectively means we require aligned extents -		 * for the single-inode case, whereas the other cases -		 * allow an unaligned length so long as it ends at -		 * i_size. -		 */ -		if (len != olen) -			return -EINVAL; - -		/* Check for overlapping ranges */ -		if (dst_loff + len > loff && dst_loff < loff + len) -			return -EINVAL; - -		same_lock_start = min_t(u64, loff, dst_loff); -		same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; -	} else { -		/* -		 * If the source and destination inodes are different, the -		 * source's range end offset matches the source's i_size, that -		 * i_size is not a multiple of the sector size, and the -		 * destination range does not go past the destination's i_size, -		 * we must round down the length to the nearest sector size -		 * multiple. If we don't do this adjustment we end replacing -		 * with zeroes the bytes in the range that starts at the -		 * deduplication range's end offset and ends at the next sector -		 * size multiple. -		 */ -		if (loff + olen == i_size_read(src) && -		    dst_loff + len < i_size_read(dst)) { -			const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; - -			len = round_down(i_size_read(src), sz) - loff; -			if (len == 0) -				return 0; -			olen = len; -		} -	} - -again: -	ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); -	if (ret) -		return ret; -	if (same_inode) -		ret = lock_extent_range(src, same_lock_start, same_lock_len, -					false); -	else -		ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, -					       false); +	if (loff + len == src->i_size) +		len = ALIGN(src->i_size, bs) - loff;  	/* -	 * If one of the inodes has dirty pages in the respective range or -	 * ordered extents, we need to flush dellaloc and wait for all ordered -	 * extents in the range. We must unlock the pages and the ranges in the -	 * io trees to avoid deadlocks when flushing delalloc (requires locking -	 * pages) and when waiting for ordered extents to complete (they require -	 * range locking). 
+	 * For same inode case we don't want our length pushed out past i_size +	 * as comparing that data range makes no sense. +	 * +	 * This effectively means we require aligned extents for the single +	 * inode case, whereas the other cases allow an unaligned length so long +	 * as it ends at i_size.  	 */ -	if (ret == -EAGAIN) { -		/* -		 * Ranges in the io trees already unlocked. Now unlock all -		 * pages before waiting for all IO to complete. -		 */ -		btrfs_cmp_data_free(cmp); -		if (same_inode) { -			btrfs_wait_ordered_range(src, same_lock_start, -						 same_lock_len); -		} else { -			btrfs_wait_ordered_range(src, loff, len); -			btrfs_wait_ordered_range(dst, dst_loff, len); -		} -		goto again; -	} -	ASSERT(ret == 0); -	if (WARN_ON(ret)) { -		/* ranges in the io trees already unlocked */ -		btrfs_cmp_data_free(cmp); -		return ret; -	} - -	/* pass original length for comparison so we stay within i_size */ -	ret = btrfs_cmp_data(olen, cmp); -	if (ret == 0) -		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); - -	if (same_inode) -		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, -			      same_lock_start + same_lock_len - 1); -	else -		btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); +	if (dst == src && len != olen) +		return -EINVAL; -	btrfs_cmp_data_free(cmp); +	/* +	 * Lock destination range to serialize with concurrent readpages(). +	 */ +	lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); +	ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); +	unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);  	return ret;  } @@ -3557,58 +3257,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,  			     struct inode *dst, u64 dst_loff)  {  	int ret; -	struct cmp_pages cmp;  	int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; -	bool same_inode = (src == dst);  	u64 i, tail_len, chunk_count; -	if (olen == 0) -		return 0; - -	if (same_inode) -		inode_lock(src); -	else -		btrfs_double_inode_lock(src, dst); -  	/* don't make the dst file partly checksummed */  	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != -	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { -		ret = -EINVAL; -		goto out_unlock; -	} +	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) +		return -EINVAL; + +	if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) +		return -ETXTBSY;  	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;  	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);  	if (chunk_count == 0)  		num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; -	/* -	 * If deduping ranges in the same inode, locking rules make it -	 * mandatory to always lock pages in ascending order to avoid deadlocks -	 * with concurrent tasks (such as starting writeback/delalloc). -	 */ -	if (same_inode && dst_loff < loff) -		swap(loff, dst_loff); - -	/* -	 * We must gather up all the pages before we initiate our extent -	 * locking. We use an array for the page pointers. Size of the array is -	 * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. 
-	 */ -	cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), -				       GFP_KERNEL | __GFP_ZERO); -	cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), -				       GFP_KERNEL | __GFP_ZERO); -	if (!cmp.src_pages || !cmp.dst_pages) { -		ret = -ENOMEM; -		goto out_free; -	} -  	for (i = 0; i < chunk_count; i++) {  		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, -					      dst, dst_loff, &cmp); +					      dst, dst_loff);  		if (ret) -			goto out_free; +			return ret;  		loff += BTRFS_MAX_DEDUPE_LEN;  		dst_loff += BTRFS_MAX_DEDUPE_LEN; @@ -3616,17 +3285,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,  	if (tail_len > 0)  		ret = btrfs_extent_same_range(src, loff, tail_len, dst, -					      dst_loff, &cmp); - -out_free: -	kvfree(cmp.src_pages); -	kvfree(cmp.dst_pages); - -out_unlock: -	if (same_inode) -		inode_unlock(src); -	else -		btrfs_double_inode_unlock(src, dst); +					      dst_loff);  	return ret;  } @@ -4213,11 +3872,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,  	struct inode *inode = file_inode(file);  	struct inode *src = file_inode(file_src);  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret;  	u64 len = olen;  	u64 bs = fs_info->sb->s_blocksize; -	int same_inode = src == inode;  	/*  	 * TODO: @@ -4230,101 +3887,35 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,  	 *   be either compressed or non-compressed.  	 */ -	if (btrfs_root_readonly(root)) -		return -EROFS; - -	if (file_src->f_path.mnt != file->f_path.mnt || -	    src->i_sb != inode->i_sb) -		return -EXDEV; - -	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) -		return -EISDIR; - -	if (!same_inode) { -		btrfs_double_inode_lock(src, inode); -	} else { -		inode_lock(src); -	} -  	/* don't make the dst file partly checksummed */  	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != -	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { -		ret = -EINVAL; -		goto out_unlock; -	} +	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) +		return -EINVAL; + +	if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) +		return -ETXTBSY; -	/* determine range to clone */ -	ret = -EINVAL; -	if (off + len > src->i_size || off + len < off) -		goto out_unlock; -	if (len == 0) -		olen = len = src->i_size - off;  	/* -	 * If we extend to eof, continue to block boundary if and only if the -	 * destination end offset matches the destination file's size, otherwise -	 * we would be corrupting data by placing the eof block into the middle -	 * of a file. +	 * VFS's generic_remap_file_range_prep() protects us from cloning the +	 * eof block into the middle of a file, which would result in corruption +	 * if the file size is not blocksize aligned. So we don't need to check +	 * for that case here.  	 
*/ -	if (off + len == src->i_size) { -		if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size) -			goto out_unlock; +	if (off + len == src->i_size)  		len = ALIGN(src->i_size, bs) - off; -	} - -	if (len == 0) { -		ret = 0; -		goto out_unlock; -	} - -	/* verify the end result is block aligned */ -	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || -	    !IS_ALIGNED(destoff, bs)) -		goto out_unlock; - -	/* verify if ranges are overlapped within the same file */ -	if (same_inode) { -		if (destoff + len > off && destoff < off + len) -			goto out_unlock; -	}  	if (destoff > inode->i_size) {  		ret = btrfs_cont_expand(inode, inode->i_size, destoff);  		if (ret) -			goto out_unlock; +			return ret;  	}  	/* -	 * Lock the target range too. Right after we replace the file extent -	 * items in the fs tree (which now point to the cloned data), we might -	 * have a worker replace them with extent items relative to a write -	 * operation that was issued before this clone operation (i.e. confront -	 * with inode.c:btrfs_finish_ordered_io). +	 * Lock destination range to serialize with concurrent readpages().  	 */ -	if (same_inode) { -		u64 lock_start = min_t(u64, off, destoff); -		u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - -		ret = lock_extent_range(src, lock_start, lock_len, true); -	} else { -		ret = btrfs_double_extent_lock(src, off, inode, destoff, len, -					       true); -	} -	ASSERT(ret == 0); -	if (WARN_ON(ret)) { -		/* ranges in the io trees already unlocked */ -		goto out_unlock; -	} - +	lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);  	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - -	if (same_inode) { -		u64 lock_start = min_t(u64, off, destoff); -		u64 lock_end = max_t(u64, off, destoff) + len - 1; - -		unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); -	} else { -		btrfs_double_extent_unlock(src, off, inode, destoff, len); -	} +	unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);  	/*  	 * Truncate page cache pages so that future reads will see the cloned  	 * data immediately and not the previous data. 
@@ -4332,11 +3923,87 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,  	truncate_inode_pages_range(&inode->i_data,  				round_down(destoff, PAGE_SIZE),  				round_up(destoff + len, PAGE_SIZE) - 1); -out_unlock: + +	return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, +				       struct file *file_out, loff_t pos_out, +				       loff_t *len, unsigned int remap_flags) +{ +	struct inode *inode_in = file_inode(file_in); +	struct inode *inode_out = file_inode(file_out); +	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; +	bool same_inode = inode_out == inode_in; +	u64 wb_len; +	int ret; + +	if (!(remap_flags & REMAP_FILE_DEDUP)) { +		struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + +		if (btrfs_root_readonly(root_out)) +			return -EROFS; + +		if (file_in->f_path.mnt != file_out->f_path.mnt || +		    inode_in->i_sb != inode_out->i_sb) +			return -EXDEV; +	} + +	if (same_inode) +		inode_lock(inode_in); +	else +		btrfs_double_inode_lock(inode_in, inode_out); + +	/* +	 * Now that the inodes are locked, we need to start writeback ourselves +	 * and can not rely on the writeback from the VFS's generic helper +	 * generic_remap_file_range_prep() because: +	 * +	 * 1) For compression we must call filemap_fdatawrite_range() range +	 *    twice (btrfs_fdatawrite_range() does it for us), and the generic +	 *    helper only calls it once; +	 * +	 * 2) filemap_fdatawrite_range(), called by the generic helper only +	 *    waits for the writeback to complete, i.e. for IO to be done, and +	 *    not for the ordered extents to complete. We need to wait for them +	 *    to complete so that new file extent items are in the fs tree. +	 */ +	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) +		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); +	else +		wb_len = ALIGN(*len, bs); + +	/* +	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as +	 * any in progress could create its ordered extents after we wait for +	 * existing ordered extents below). +	 */ +	inode_dio_wait(inode_in);  	if (!same_inode) -		btrfs_double_inode_unlock(src, inode); +		inode_dio_wait(inode_out); + +	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), +				       wb_len); +	if (ret < 0) +		goto out_unlock; +	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), +				       wb_len); +	if (ret < 0) +		goto out_unlock; + +	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, +					    len, remap_flags); +	if (ret < 0 || *len == 0) +		goto out_unlock; + +	return 0; + + out_unlock: +	if (same_inode) +		inode_unlock(inode_in);  	else -		inode_unlock(src); +		btrfs_double_inode_unlock(inode_in, inode_out); +  	return ret;  } @@ -4344,29 +4011,29 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,  		struct file *dst_file, loff_t destoff, loff_t len,  		unsigned int remap_flags)  { +	struct inode *src_inode = file_inode(src_file); +	struct inode *dst_inode = file_inode(dst_file); +	bool same_inode = dst_inode == src_inode;  	int ret;  	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))  		return -EINVAL; -	if (remap_flags & REMAP_FILE_DEDUP) { -		struct inode *src = file_inode(src_file); -		struct inode *dst = file_inode(dst_file); -		u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - -		if (WARN_ON_ONCE(bs < PAGE_SIZE)) { -			/* -			 * Btrfs does not support blocksize < page_size. 
As a -			 * result, btrfs_cmp_data() won't correctly handle -			 * this situation without an update. -			 */ -			return -EINVAL; -		} +	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, +					  &len, remap_flags); +	if (ret < 0 || len == 0) +		return ret; -		ret = btrfs_extent_same(src, off, len, dst, destoff); -	} else { +	if (remap_flags & REMAP_FILE_DEDUP) +		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); +	else  		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); -	} + +	if (same_inode) +		inode_unlock(src_inode); +	else +		btrfs_double_inode_unlock(src_inode, dst_inode); +  	return ret < 0 ? ret : len;  } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b6a4cc178bee..90639140439f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -27,7 +27,7 @@   *     Records the total size (including the header) of compressed data.   *   * 2.  Segment(s) - *     Variable size. Each segment includes one segment header, followd by data + *     Variable size. Each segment includes one segment header, followed by data   *     payload.   *     One regular LZO compressed extent can have one or more segments.   *     For inlined LZO compressed extent, only one segment is allowed. diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0c4ef208b8b9..6fde2b2741ef 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,  	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);  	struct btrfs_root *root = btrfs_inode->root;  	struct rb_node *node; -	bool dec_pending_ordered = false;  	/* This is paired with btrfs_add_ordered_extent. */  	spin_lock(&btrfs_inode->lock); @@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,  	if (tree->last == node)  		tree->last = NULL;  	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); -	if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) -		dec_pending_ordered = true;  	spin_unlock_irq(&tree->lock); -	/* -	 * The current running transaction is waiting on us, we need to let it -	 * know that we're complete and wake it up. -	 */ -	if (dec_pending_ordered) { -		struct btrfs_transaction *trans; - -		/* -		 * The checks for trans are just a formality, it should be set, -		 * but if it isn't we don't want to deref/assert under the spin -		 * lock, so be nice and check if trans is set, but ASSERT() so -		 * if it isn't set a developer will notice. -		 */ -		spin_lock(&fs_info->trans_lock); -		trans = fs_info->running_transaction; -		if (trans) -			refcount_inc(&trans->use_count); -		spin_unlock(&fs_info->trans_lock); - -		ASSERT(trans); -		if (trans) { -			if (atomic_dec_and_test(&trans->pending_ordered)) -				wake_up(&trans->pending_wait); -			btrfs_put_transaction(trans); -		} -	} -  	spin_lock(&root->ordered_extent_lock);  	list_del_init(&entry->root_extent_list);  	root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 02d813aaa261..fb9a161f0215 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -37,28 +37,31 @@ struct btrfs_ordered_sum {   * rbtree, just before waking any waiters.  It is used to indicate the   * IO is done and any metadata is inserted into the tree.   
*/ -#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ - -#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ - -#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ - -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ - -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ - -#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ - -#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ - -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent -				       * has done its due diligence in updating -				       * the isize. */ -#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */ - -#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to -				  * complete in the current transaction. */ -#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */ +enum { +	/* set when all the pages are written */ +	BTRFS_ORDERED_IO_DONE, +	/* set when removed from the tree */ +	BTRFS_ORDERED_COMPLETE, +	/* set when we want to write in place */ +	BTRFS_ORDERED_NOCOW, +	/* writing a zlib compressed extent */ +	BTRFS_ORDERED_COMPRESSED, +	/* set when writing to preallocated extent */ +	BTRFS_ORDERED_PREALLOC, +	/* set when we're doing DIO with this extent */ +	BTRFS_ORDERED_DIRECT, +	/* We had an io error when writing this out */ +	BTRFS_ORDERED_IOERR, +	/* +	 * indicates whether this ordered extent has done its due diligence in +	 * updating the isize +	 */ +	BTRFS_ORDERED_UPDATED_ISIZE, +	/* Set when we have to truncate an extent */ +	BTRFS_ORDERED_TRUNCATED, +	/* Regular IO for COW */ +	BTRFS_ORDERED_REGULAR, +};  struct btrfs_ordered_extent {  	/* logical offset in the file */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f70825af6438..4e473a998219 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,7 +30,7 @@   *  - sync   *  - copy also limits on subvol creation   *  - limit - *  - caches fuer ulists + *  - caches for ulists   *  - performance benchmarks   *  - check all ioctl parameters   */ @@ -522,7 +522,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)  		__del_qgroup_rb(qgroup);  	}  	/* -	 * we call btrfs_free_qgroup_config() when umounting +	 * We call btrfs_free_qgroup_config() when unmounting  	 * filesystem and disabling quota, so we set qgroup_ulist  	 * to be null here to avoid double free.  	 */ @@ -1013,16 +1013,22 @@ out_add_root:  		btrfs_abort_transaction(trans, ret);  		goto out_free_path;  	} -	spin_lock(&fs_info->qgroup_lock); -	fs_info->quota_root = quota_root; -	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); -	spin_unlock(&fs_info->qgroup_lock);  	ret = btrfs_commit_transaction(trans);  	trans = NULL;  	if (ret)  		goto out_free_path; +	/* +	 * Set quota enabled flag after committing the transaction, to avoid +	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot +	 * creation. +	 */ +	spin_lock(&fs_info->qgroup_lock); +	fs_info->quota_root = quota_root; +	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); +	spin_unlock(&fs_info->qgroup_lock); +  	ret = qgroup_rescan_init(fs_info, 0, 1);  	if (!ret) {  	        qgroup_rescan_zero_tracking(fs_info); @@ -1122,7 +1128,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,   * The easy accounting, we're updating qgroup relationship whose child qgroup   * only has exclusive extents.   
* - * In this case, all exclsuive extents will also be exlusive for parent, so + * In this case, all exclusive extents will also be exclusive for parent, so   * excl/rfer just get added/removed.   *   * So is qgroup reservation space, which should also be added/removed to @@ -1749,14 +1755,14 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)   *   * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty   *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. - *    They should be marked during preivous (@dst_level = 1) iteration. + *    They should be marked during previous (@dst_level = 1) iteration.   *   * 3) Mark file extents in leaves dirty   *    We don't have good way to pick out new file extents only.   *    So we still follow the old method by scanning all file extents in   *    the leave.   * - * This function can free us from keeping two pathes, thus later we only need + * This function can free us from keeping two paths, thus later we only need   * to care about how to iterate all new tree blocks in reloc tree.   */  static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, @@ -1895,7 +1901,7 @@ out:   *   * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace   * above tree blocks along with their counter parts in file tree. - * While during search, old tree blocsk OO(c) will be skiped as tree block swap + * While during search, old tree blocks OO(c) will be skipped as tree block swap   * won't affect OO(c).   */  static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, @@ -2020,7 +2026,7 @@ out:   * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and   * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,   * and then go down @src_eb (pointed by @src_parent and @src_slot) to find - * the conterpart of the tree block, then mark both tree blocks as qgroup dirty, + * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,   * and skip all tree blocks whose generation is smaller than last_snapshot.   *   * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), @@ -3104,9 +3110,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,  		mutex_unlock(&fs_info->qgroup_rescan_lock);  		goto out;  	} -	extent_buffer_get(scratch_leaf); -	btrfs_tree_read_lock(scratch_leaf); -	btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);  	slot = path->slots[0];  	btrfs_release_path(path);  	mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3132,10 +3135,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,  			goto out;  	}  out: -	if (scratch_leaf) { -		btrfs_tree_read_unlock_blocking(scratch_leaf); +	if (scratch_leaf)  		free_extent_buffer(scratch_leaf); -	}  	if (done && !ret) {  		ret = 1; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d8f78f5ab854..20c6bd5fa701 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -70,7 +70,7 @@ struct btrfs_qgroup_extent_record {   *	be converted into META_PERTRANS.   */  enum btrfs_qgroup_rsv_type { -	BTRFS_QGROUP_RSV_DATA = 0, +	BTRFS_QGROUP_RSV_DATA,  	BTRFS_QGROUP_RSV_META_PERTRANS,  	BTRFS_QGROUP_RSV_META_PREALLOC,  	BTRFS_QGROUP_RSV_LAST, @@ -81,10 +81,10 @@ enum btrfs_qgroup_rsv_type {   *   * Each type should have different reservation behavior.   * E.g, data follows its io_tree flag modification, while - * *currently* meta is just reserve-and-clear during transcation. 
+ * *currently* meta is just reserve-and-clear during transaction.   *   * TODO: Add new type for reservation which can survive transaction commit. - * Currect metadata reservation behavior is not suitable for such case. + * Current metadata reservation behavior is not suitable for such case.   */  struct btrfs_qgroup_rsv {  	u64 values[BTRFS_QGROUP_RSV_LAST]; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index df41d7049936..e74455eb42f9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1980,7 +1980,7 @@ cleanup_io:  		 * - In case of single failure, where rbio->failb == -1:  		 *  		 *   Cache this rbio iff the above read reconstruction is -		 *   excuted without problems. +		 *   executed without problems.  		 */  		if (err == BLK_STS_OK && rbio->failb < 0)  			cache_rbio_pages(rbio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index dec14b739b10..10d9589001a9 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -376,26 +376,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,  		goto error;  	} +	/* Insert extent in reada tree + all per-device trees, all or nothing */ +	down_read(&fs_info->dev_replace.rwsem);  	ret = radix_tree_preload(GFP_KERNEL); -	if (ret) +	if (ret) { +		up_read(&fs_info->dev_replace.rwsem);  		goto error; +	} -	/* insert extent in reada_tree + all per-device trees, all or nothing */ -	btrfs_dev_replace_read_lock(&fs_info->dev_replace);  	spin_lock(&fs_info->reada_lock);  	ret = radix_tree_insert(&fs_info->reada_tree, index, re);  	if (ret == -EEXIST) {  		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);  		re_exist->refcnt++;  		spin_unlock(&fs_info->reada_lock); -		btrfs_dev_replace_read_unlock(&fs_info->dev_replace);  		radix_tree_preload_end(); +		up_read(&fs_info->dev_replace.rwsem);  		goto error;  	}  	if (ret) {  		spin_unlock(&fs_info->reada_lock); -		btrfs_dev_replace_read_unlock(&fs_info->dev_replace);  		radix_tree_preload_end(); +		up_read(&fs_info->dev_replace.rwsem);  		goto error;  	}  	radix_tree_preload_end(); @@ -437,13 +439,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,  			}  			radix_tree_delete(&fs_info->reada_tree, index);  			spin_unlock(&fs_info->reada_lock); -			btrfs_dev_replace_read_unlock(&fs_info->dev_replace); +			up_read(&fs_info->dev_replace.rwsem);  			goto error;  		}  		have_zone = 1;  	}  	spin_unlock(&fs_info->reada_lock); -	btrfs_dev_replace_read_unlock(&fs_info->dev_replace); +	up_read(&fs_info->dev_replace.rwsem);  	if (!have_zone)  		goto error; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d69fbfb30aa9..c3557c12656b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -43,7 +43,7 @@ struct ref_entry {   * back to the delayed ref action.  We hold the ref we are changing in the   * action so we can account for the history properly, and we record the root we   * were called with since it could be different from ref_root.  We also store - * stack traces because thats how I roll. + * stack traces because that's how I roll.   */  struct ref_action {  	int action; @@ -56,7 +56,7 @@ struct ref_action {  /*   * One of these for every block we reference, it holds the roots and references - * to it as well as all of the ref actions that have occured to it.  We never + * to it as well as all of the ref actions that have occurred to it.  We never   * free it until we unmount the file system in order to make sure re-allocations   * are happening properly.   
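
The reada.c hunk above swaps the old btrfs_dev_replace_read_lock()/unlock() helpers for a plain down_read()/up_read() on dev_replace.rwsem, and takes the semaphore before radix_tree_preload() so that every bail-out path drops it exactly once. Below is a minimal userspace sketch of that "lock, prepare, insert or unwind" shape; insert_extent(), preload() and the pthread rwlock are toy stand-ins, not btrfs API.

/* Every error path releases the lock it took; the insert is all or nothing. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t replace_lock = PTHREAD_RWLOCK_INITIALIZER;
static int tree[16];			/* toy stand-in for the radix tree */

static int preload(void)		/* stand-in for radix_tree_preload() */
{
	return 0;			/* pretend the preload allocation worked */
}

static int insert_extent(unsigned long index, int value)
{
	int ret;

	pthread_rwlock_rdlock(&replace_lock);	/* like down_read(&dev_replace.rwsem) */
	ret = preload();
	if (ret) {
		pthread_rwlock_unlock(&replace_lock);
		return ret;
	}
	if (index >= 16 || tree[index]) {	/* like radix_tree_insert() returning -EEXIST */
		pthread_rwlock_unlock(&replace_lock);
		return -EEXIST;
	}
	tree[index] = value;
	pthread_rwlock_unlock(&replace_lock);	/* like up_read() */
	return 0;
}

int main(void)
{
	printf("first insert: %d\n", insert_extent(3, 42));
	printf("second insert: %d\n", insert_extent(3, 43));	/* hits the -EEXIST path */
	return 0;
}
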
*/ @@ -859,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,  			 * This shouldn't happen because we will add our re  			 * above when we lookup the be with !parent, but just in  			 * case catch this case so we don't panic because I -			 * didn't thik of some other corner case. +			 * didn't think of some other corner case.  			 */  			btrfs_err(fs_info, "failed to find root %llu for %llu",  				  root->root_key.objectid, be->bytenr); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a3f75b8926d4..272b287f8cf0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2631,7 +2631,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,  		 * only one thread can access block_rsv at this point,  		 * so we don't need hold lock to protect block_rsv.  		 * we expand more reservation size here to allow enough -		 * space for relocation and we will return eailer in +		 * space for relocation and we will return earlier in  		 * enospc case.  		 */  		rc->block_rsv->size = tmp + fs_info->nodesize * @@ -4185,37 +4185,13 @@ static struct reloc_control *alloc_reloc_control(void)  static void describe_relocation(struct btrfs_fs_info *fs_info,  				struct btrfs_block_group_cache *block_group)  { -	char buf[128];		/* prefixed by a '|' that'll be dropped */ -	u64 flags = block_group->flags; +	char buf[128] = {'\0'}; -	/* Shouldn't happen */ -	if (!flags) { -		strcpy(buf, "|NONE"); -	} else { -		char *bp = buf; - -#define DESCRIBE_FLAG(f, d) \ -		if (flags & BTRFS_BLOCK_GROUP_##f) { \ -			bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ -			flags &= ~BTRFS_BLOCK_GROUP_##f; \ -		} -		DESCRIBE_FLAG(DATA,     "data"); -		DESCRIBE_FLAG(SYSTEM,   "system"); -		DESCRIBE_FLAG(METADATA, "metadata"); -		DESCRIBE_FLAG(RAID0,    "raid0"); -		DESCRIBE_FLAG(RAID1,    "raid1"); -		DESCRIBE_FLAG(DUP,      "dup"); -		DESCRIBE_FLAG(RAID10,   "raid10"); -		DESCRIBE_FLAG(RAID5,    "raid5"); -		DESCRIBE_FLAG(RAID6,    "raid6"); -		if (flags) -			snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); -#undef DESCRIBE_FLAG -	} +	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));  	btrfs_info(fs_info,  		   "relocating block group %llu flags %s", -		   block_group->key.objectid, buf + 1); +		   block_group->key.objectid, buf);  }  /* @@ -4223,6 +4199,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,   */  int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)  { +	struct btrfs_block_group_cache *bg;  	struct btrfs_root *extent_root = fs_info->extent_root;  	struct reloc_control *rc;  	struct inode *inode; @@ -4231,14 +4208,23 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)  	int rw = 0;  	int err = 0; +	bg = btrfs_lookup_block_group(fs_info, group_start); +	if (!bg) +		return -ENOENT; + +	if (btrfs_pinned_by_swapfile(fs_info, bg)) { +		btrfs_put_block_group(bg); +		return -ETXTBSY; +	} +  	rc = alloc_reloc_control(); -	if (!rc) +	if (!rc) { +		btrfs_put_block_group(bg);  		return -ENOMEM; +	}  	rc->extent_root = extent_root; - -	rc->block_group = btrfs_lookup_block_group(fs_info, group_start); -	BUG_ON(!rc->block_group); +	rc->block_group = bg;  	ret = btrfs_inc_block_group_ro(rc->block_group);  	if (ret) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..6dcd36d7b849 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -339,7 +339,9 @@ static struct full_stripe_lock *insert_full_stripe_lock(  		}  	} -	/* Insert new lock */ +	/* 
+	 * Insert new lock. +	 */  	ret = kmalloc(sizeof(*ret), GFP_KERNEL);  	if (!ret)  		return ERR_PTR(-ENOMEM); @@ -568,12 +570,11 @@ static void scrub_put_ctx(struct scrub_ctx *sctx)  		scrub_free_ctx(sctx);  } -static noinline_for_stack -struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) +static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( +		struct btrfs_fs_info *fs_info, int is_dev_replace)  {  	struct scrub_ctx *sctx;  	int		i; -	struct btrfs_fs_info *fs_info = dev->fs_info;  	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);  	if (!sctx) @@ -582,7 +583,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)  	sctx->is_dev_replace = is_dev_replace;  	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;  	sctx->curr = -1; -	sctx->fs_info = dev->fs_info; +	sctx->fs_info = fs_info;  	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {  		struct scrub_bio *sbio; @@ -832,6 +833,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	int page_num;  	int success;  	bool full_stripe_locked; +	unsigned int nofs_flag;  	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,  				      DEFAULT_RATELIMIT_BURST); @@ -857,6 +859,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	dev = sblock_to_check->pagev[0]->dev;  	/* +	 * We must use GFP_NOFS because the scrub task might be waiting for a +	 * worker task executing this function and in turn a transaction commit +	 * might be waiting the scrub task to pause (which needs to wait for all +	 * the worker tasks to complete before pausing). +	 * We do allocations in the workers through insert_full_stripe_lock() +	 * and scrub_add_page_to_wr_bio(), which happens down the call chain of +	 * this function. +	 */ +	nofs_flag = memalloc_nofs_save(); +	/*  	 * For RAID5/6, race can happen for a different device scrub thread.  	 * For data corruption, Parity and Data threads will both try  	 * to recovery the data. @@ -865,6 +877,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	 */  	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);  	if (ret < 0) { +		memalloc_nofs_restore(nofs_flag);  		spin_lock(&sctx->stat_lock);  		if (ret == -ENOMEM)  			sctx->stat.malloc_errors++; @@ -904,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	 */  	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, -				      sizeof(*sblocks_for_recheck), GFP_NOFS); +				      sizeof(*sblocks_for_recheck), GFP_KERNEL);  	if (!sblocks_for_recheck) {  		spin_lock(&sctx->stat_lock);  		sctx->stat.malloc_errors++; @@ -1202,6 +1215,7 @@ out:  	}  	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); +	memalloc_nofs_restore(nofs_flag);  	if (ret < 0)  		return ret;  	return 0; @@ -3540,7 +3554,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,  		if (!ret && sctx->is_dev_replace) {  			/*  			 * If we are doing a device replace wait for any tasks -			 * that started dellaloc right before we set the block +			 * that started delalloc right before we set the block  			 * group to RO mode, as they might have just allocated  			 * an extent from it or decided they could do a nocow  			 * write. 
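
The scrub_handle_errored_block() change above relies on memalloc_nofs_save()/memalloc_nofs_restore() to make the whole call chain implicitly NOFS, which is why the kcalloc() there can switch to GFP_KERNEL. A rough kernel-context sketch of that scoping pattern follows; repair_block_sketch() is a hypothetical helper, not a btrfs function.

/* Kernel-context sketch: scope a NOFS section and restore it on every exit. */
#include <linux/errno.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

static int repair_block_sketch(size_t nr)
{
	unsigned int nofs_flag;
	void *scratch;
	int ret = 0;

	nofs_flag = memalloc_nofs_save();	/* reclaim below won't recurse into the fs */

	scratch = kcalloc(nr, sizeof(long), GFP_KERNEL); /* implicitly degraded to NOFS */
	if (!scratch) {
		ret = -ENOMEM;
		goto out;			/* restore even on the error path */
	}

	/* ... recheck/repair work would go here ... */

	kfree(scratch);
out:
	memalloc_nofs_restore(nofs_flag);
	return ret;
}
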
And if any such tasks did that, wait for their @@ -3596,11 +3610,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,  			break;  		} -		btrfs_dev_replace_write_lock(&fs_info->dev_replace); +		down_write(&fs_info->dev_replace.rwsem);  		dev_replace->cursor_right = found_key.offset + length;  		dev_replace->cursor_left = found_key.offset;  		dev_replace->item_needs_writeback = 1; -		btrfs_dev_replace_write_unlock(&fs_info->dev_replace); +		up_write(&dev_replace->rwsem); +  		ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,  				  found_key.offset, cache); @@ -3636,10 +3651,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,  		scrub_pause_off(fs_info); -		btrfs_dev_replace_write_lock(&fs_info->dev_replace); +		down_write(&fs_info->dev_replace.rwsem);  		dev_replace->cursor_left = dev_replace->cursor_right;  		dev_replace->item_needs_writeback = 1; -		btrfs_dev_replace_write_unlock(&fs_info->dev_replace); +		up_write(&fs_info->dev_replace.rwsem);  		if (ro_set)  			btrfs_dec_block_group_ro(cache); @@ -3772,6 +3787,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,  	struct scrub_ctx *sctx;  	int ret;  	struct btrfs_device *dev; +	unsigned int nofs_flag;  	if (btrfs_fs_closing(fs_info))  		return -EINVAL; @@ -3813,13 +3829,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,  		return -EINVAL;  	} +	/* Allocate outside of device_list_mutex */ +	sctx = scrub_setup_ctx(fs_info, is_dev_replace); +	if (IS_ERR(sctx)) +		return PTR_ERR(sctx);  	mutex_lock(&fs_info->fs_devices->device_list_mutex);  	dev = btrfs_find_device(fs_info, devid, NULL, NULL);  	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&  		     !is_dev_replace)) {  		mutex_unlock(&fs_info->fs_devices->device_list_mutex); -		return -ENODEV; +		ret = -ENODEV; +		goto out_free_ctx;  	}  	if (!is_dev_replace && !readonly && @@ -3827,7 +3848,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,  		mutex_unlock(&fs_info->fs_devices->device_list_mutex);  		btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",  				rcu_str_deref(dev->name)); -		return -EROFS; +		ret = -EROFS; +		goto out_free_ctx;  	}  	mutex_lock(&fs_info->scrub_lock); @@ -3835,34 +3857,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,  	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {  		mutex_unlock(&fs_info->scrub_lock);  		mutex_unlock(&fs_info->fs_devices->device_list_mutex); -		return -EIO; +		ret = -EIO; +		goto out_free_ctx;  	} -	btrfs_dev_replace_read_lock(&fs_info->dev_replace); +	down_read(&fs_info->dev_replace.rwsem);  	if (dev->scrub_ctx ||  	    (!is_dev_replace &&  	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { -		btrfs_dev_replace_read_unlock(&fs_info->dev_replace); +		up_read(&fs_info->dev_replace.rwsem);  		mutex_unlock(&fs_info->scrub_lock);  		mutex_unlock(&fs_info->fs_devices->device_list_mutex); -		return -EINPROGRESS; +		ret = -EINPROGRESS; +		goto out_free_ctx;  	} -	btrfs_dev_replace_read_unlock(&fs_info->dev_replace); +	up_read(&fs_info->dev_replace.rwsem);  	ret = scrub_workers_get(fs_info, is_dev_replace);  	if (ret) {  		mutex_unlock(&fs_info->scrub_lock);  		mutex_unlock(&fs_info->fs_devices->device_list_mutex); -		return ret; +		goto out_free_ctx;  	} -	sctx = scrub_setup_ctx(dev, is_dev_replace); -	if (IS_ERR(sctx)) { -		mutex_unlock(&fs_info->scrub_lock); -		mutex_unlock(&fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(fs_info); -		return PTR_ERR(sctx); 
-	}  	sctx->readonly = readonly;  	dev->scrub_ctx = sctx;  	mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3875,6 +3892,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,  	atomic_inc(&fs_info->scrubs_running);  	mutex_unlock(&fs_info->scrub_lock); +	/* +	 * In order to avoid deadlock with reclaim when there is a transaction +	 * trying to pause scrub, make sure we use GFP_NOFS for all the +	 * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() +	 * invoked by our callees. The pausing request is done when the +	 * transaction commit starts, and it blocks the transaction until scrub +	 * is paused (done at specific points at scrub_stripe() or right above +	 * before incrementing fs_info->scrubs_running). +	 */ +	nofs_flag = memalloc_nofs_save();  	if (!is_dev_replace) {  		/*  		 * by holding device list mutex, we can @@ -3887,6 +3914,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,  	if (!ret)  		ret = scrub_enumerate_chunks(sctx, dev, start, end); +	memalloc_nofs_restore(nofs_flag);  	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);  	atomic_dec(&fs_info->scrubs_running); @@ -3905,6 +3933,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,  	scrub_put_ctx(sctx);  	return ret; + +out_free_ctx: +	scrub_free_ctx(sctx); + +	return ret;  }  void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5be83b5a1b43..1b15b43905f8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2238,7 +2238,7 @@ out:   * inodes "orphan" name instead of the real name and stop. Same with new inodes   * that were not created yet and overwritten inodes/refs.   * - * When do we have have orphan inodes: + * When do we have orphan inodes:   * 1. When an inode is freshly created and thus no valid refs are available yet   * 2. When a directory lost all it's refs (deleted) but still has dir items   *    inside which were not processed yet (pending for move/delete). If anyone @@ -3854,7 +3854,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)  		/*  		 * We may have refs where the parent directory does not exist  		 * yet. This happens if the parent directories inum is higher -		 * the the current inum. To handle this case, we create the +		 * than the current inum. To handle this case, we create the  		 * parent directory out of order. But we need to check if this  		 * did already happen before due to other refs in the same dir.  		 */ @@ -4775,7 +4775,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)  	struct btrfs_key key;  	pgoff_t index = offset >> PAGE_SHIFT;  	pgoff_t last_index; -	unsigned pg_offset = offset & ~PAGE_MASK; +	unsigned pg_offset = offset_in_page(offset);  	ssize_t ret = 0;  	key.objectid = sctx->cur_ino; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 645fc81e2a94..368a5b9e6c13 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -93,7 +93,7 @@ const char *btrfs_decode_error(int errno)  /*   * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the approciate error response. + * invokes the appropriate error response.   */  __cold  void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, @@ -151,7 +151,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function  	 * although there is no way to update the progress. 
It would add the  	 * risk of a deadlock, therefore the canceling is omitted. The only  	 * penalty is that some I/O remains active until the procedure -	 * completes. The next time when the filesystem is mounted writeable +	 * completes. The next time when the filesystem is mounted writable  	 * again, the device replace operation continues.  	 */  } @@ -1848,7 +1848,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)  		if (!btrfs_check_rw_degradable(fs_info, NULL)) {  			btrfs_warn(fs_info, -				"too many missing devices, writeable remount is not allowed"); +		"too many missing devices, writable remount is not allowed");  			ret = -EACCES;  			goto restore;  		} @@ -2090,7 +2090,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)  	u64 total_free_data = 0;  	u64 total_free_meta = 0;  	int bits = dentry->d_sb->s_blocksize_bits; -	__be32 *fsid = (__be32 *)fs_info->fsid; +	__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;  	unsigned factor = 1;  	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;  	int ret; @@ -2312,7 +2312,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)  	 * device_list_mutex here as we only read the device data and the list  	 * is protected by RCU.  Even if a device is deleted during the list  	 * traversals, we'll get valid data, the freeing callback will wait at -	 * least until until the rcu_read_unlock. +	 * least until the rcu_read_unlock.  	 */  	rcu_read_lock();  	cur_devices = fs_info->fs_devices; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3717c864ba23..5a5930e3d32b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -191,6 +191,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);  BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);  BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);  BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);  BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);  static struct attribute *btrfs_supported_feature_attrs[] = { @@ -204,6 +205,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {  	BTRFS_FEAT_ATTR_PTR(raid56),  	BTRFS_FEAT_ATTR_PTR(skinny_metadata),  	BTRFS_FEAT_ATTR_PTR(no_holes), +	BTRFS_FEAT_ATTR_PTR(metadata_uuid),  	BTRFS_FEAT_ATTR_PTR(free_space_tree),  	NULL  }; @@ -505,12 +507,24 @@ static ssize_t quota_override_store(struct kobject *kobj,  BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); +static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, +				struct kobj_attribute *a, char *buf) +{ +	struct btrfs_fs_info *fs_info = to_fs_info(kobj); + +	return snprintf(buf, PAGE_SIZE, "%pU\n", +			fs_info->fs_devices->metadata_uuid); +} + +BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); +  static const struct attribute *btrfs_attrs[] = {  	BTRFS_ATTR_PTR(, label),  	BTRFS_ATTR_PTR(, nodesize),  	BTRFS_ATTR_PTR(, sectorsize),  	BTRFS_ATTR_PTR(, clone_alignment),  	BTRFS_ATTR_PTR(, quota_override), +	BTRFS_ATTR_PTR(, metadata_uuid),  	NULL,  }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c6ee600aff89..40716b357c1d 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -9,7 +9,7 @@  extern u64 btrfs_debugfs_test;  enum btrfs_feature_set { -	FEAT_COMPAT = 0, +	FEAT_COMPAT,  	FEAT_COMPAT_RO,  	FEAT_INCOMPAT,  	FEAT_MAX diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index db72b3b6209e..8a59597f1883 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c 
@@ -174,8 +174,10 @@ void btrfs_free_dummy_root(struct btrfs_root *root)  	/* Will be freed by btrfs_free_fs_roots */  	if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))  		return; -	if (root->node) +	if (root->node) { +		/* One for allocate_extent_buffer */  		free_extent_buffer(root->node); +	}  	kfree(root);  } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e0f4a01be14..3c46d7f23456 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -62,10 +62,11 @@ static int test_find_delalloc(u32 sectorsize)  	struct page *page;  	struct page *locked_page = NULL;  	unsigned long index = 0; -	u64 total_dirty = SZ_256M; -	u64 max_bytes = SZ_128M; +	/* In this test we need at least 2 file extents at its maximum size */ +	u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; +	u64 total_dirty = 2 * max_bytes;  	u64 start, end, test_start; -	u64 found; +	bool found;  	int ret = -EINVAL;  	test_msg("running find delalloc tests"); @@ -76,7 +77,7 @@ static int test_find_delalloc(u32 sectorsize)  		return -ENOMEM;  	} -	extent_io_tree_init(&tmp, inode); +	extent_io_tree_init(&tmp, NULL);  	/*  	 * First go through and create and mark all of our pages dirty, we pin @@ -106,8 +107,8 @@ static int test_find_delalloc(u32 sectorsize)  	set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);  	start = 0;  	end = 0; -	found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, -					 &end, max_bytes); +	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, +					 &end);  	if (!found) {  		test_err("should have found at least one delalloc");  		goto out_bits; @@ -137,8 +138,8 @@ static int test_find_delalloc(u32 sectorsize)  	set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);  	start = test_start;  	end = 0; -	found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, -					 &end, max_bytes); +	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, +					 &end);  	if (!found) {  		test_err("couldn't find delalloc in our range");  		goto out_bits; @@ -171,8 +172,8 @@ static int test_find_delalloc(u32 sectorsize)  	}  	start = test_start;  	end = 0; -	found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, -					 &end, max_bytes); +	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, +					 &end);  	if (found) {  		test_err("found range when we shouldn't have");  		goto out_bits; @@ -192,8 +193,8 @@ static int test_find_delalloc(u32 sectorsize)  	set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);  	start = test_start;  	end = 0; -	found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, -					 &end, max_bytes); +	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, +					 &end);  	if (!found) {  		test_err("didn't find our range");  		goto out_bits; @@ -233,8 +234,8 @@ static int test_find_delalloc(u32 sectorsize)  	 * this changes at any point in the future we will need to fix this  	 * tests expected behavior.  	 
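
The test updates around here follow find_lock_delalloc_range()'s new calling convention: a bool result, the located range coming back through the start/end pointers, and no max_bytes argument. The snippet below only illustrates that convention with a toy map; it is not the btrfs implementation.

/* Illustration of a bool-returning range finder with out-parameters. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; bool delalloc; };

static bool find_delalloc_sketch(const struct range *map, int n,
				 uint64_t *start, uint64_t *end)
{
	for (int i = 0; i < n; i++) {
		if (map[i].delalloc && map[i].end >= *start) {
			*start = map[i].start;	/* found: out-params describe the range */
			*end = map[i].end;
			return true;
		}
	}
	return false;				/* not found: out-params left alone */
}

int main(void)
{
	struct range map[] = { { 0, 4095, false }, { 4096, 131071, true } };
	uint64_t start = 0, end = 0;

	if (find_delalloc_sketch(map, 2, &start, &end))
		printf("delalloc %llu..%llu\n",
		       (unsigned long long)start, (unsigned long long)end);
	return 0;
}
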
*/ -	found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, -					 &end, max_bytes); +	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, +					 &end);  	if (!found) {  		test_err("didn't find our range");  		goto out_bits; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 64043f028820..af0c8e30d9e2 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)  		goto out;  	} -	/* -	 * We will just free a dummy node if it's ref count is 2 so we need an -	 * extra ref so our searches don't accidentally release our page. -	 */ -	extent_buffer_get(root->node);  	btrfs_set_header_nritems(root->node, 0);  	btrfs_set_header_level(root->node, 0);  	ret = -EINVAL; @@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)  		goto out;  	} -	extent_buffer_get(root->node);  	btrfs_set_header_nritems(root->node, 0);  	btrfs_set_header_level(root->node, 0);  	BTRFS_I(inode)->root = root; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1eeef9ec5da..127fa1535f58 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -233,14 +233,12 @@ loop:  	extwriter_counter_init(cur_trans, type);  	init_waitqueue_head(&cur_trans->writer_wait);  	init_waitqueue_head(&cur_trans->commit_wait); -	init_waitqueue_head(&cur_trans->pending_wait);  	cur_trans->state = TRANS_STATE_RUNNING;  	/*  	 * One for this trans handle, one so it will live on until we  	 * commit the transaction.  	 */  	refcount_set(&cur_trans->use_count, 2); -	atomic_set(&cur_trans->pending_ordered, 0);  	cur_trans->flags = 0;  	cur_trans->start_time = ktime_get_seconds(); @@ -456,7 +454,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,  		  bool enforce_qgroups)  {  	struct btrfs_fs_info *fs_info = root->fs_info; - +	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;  	struct btrfs_trans_handle *h;  	struct btrfs_transaction *cur_trans;  	u64 num_bytes = 0; @@ -485,13 +483,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,  	 * the appropriate flushing if need be.  	 */  	if (num_items && root != fs_info->chunk_root) { +		struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; +		u64 delayed_refs_bytes = 0; +  		qgroup_reserved = num_items * fs_info->nodesize;  		ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,  				enforce_qgroups);  		if (ret)  			return ERR_PTR(ret); +		/* +		 * We want to reserve all the bytes we may need all at once, so +		 * we only do 1 enospc flushing cycle per transaction start.  We +		 * accomplish this by simply assuming we'll do 2 x num_items +		 * worth of delayed refs updates in this trans handle, and +		 * refill that amount for whatever is missing in the reserve. 
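
The start_transaction() comment above describes reserving 2 x num_items worth of metadata space when the delayed refs rsv needs a refill, then migrating part of that reservation over to it. A tiny arithmetic sketch of the split, with per_item standing in for whatever btrfs_calc_trans_metadata_size() would return:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t per_item = 16384;	/* illustrative per-item metadata cost */
	const unsigned int num_items = 3;
	int delayed_refs_rsv_full = 0;

	uint64_t num_bytes = per_item * num_items;
	uint64_t delayed_refs_bytes = 0;

	if (!delayed_refs_rsv_full) {
		delayed_refs_bytes = num_bytes;
		num_bytes <<= 1;		/* one enospc flush covers both needs */
	}

	printf("reserve %llu bytes, migrate %llu to the delayed refs rsv\n",
	       (unsigned long long)num_bytes,
	       (unsigned long long)delayed_refs_bytes);
	return 0;
}
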
+		 */  		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); +		if (delayed_refs_rsv->full == 0) { +			delayed_refs_bytes = num_bytes; +			num_bytes <<= 1; +		} +  		/*  		 * Do the reservation for the relocation root creation  		 */ @@ -500,8 +513,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,  			reloc_reserved = true;  		} -		ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, -					  num_bytes, flush); +		ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); +		if (ret) +			goto reserve_fail; +		if (delayed_refs_bytes) { +			btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, +							  delayed_refs_bytes); +			num_bytes -= delayed_refs_bytes; +		} +	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && +		   !delayed_refs_rsv->full) { +		/* +		 * Some people call with btrfs_start_transaction(root, 0) +		 * because they can be throttled, but have some other mechanism +		 * for reserving space.  We still want these guys to refill the +		 * delayed block_rsv so just add 1 items worth of reservation +		 * here. +		 */ +		ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);  		if (ret)  			goto reserve_fail;  	} @@ -670,7 +699,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)  /*   * btrfs_attach_transaction_barrier() - catch the running transaction   * - * It is similar to the above function, the differentia is this one + * It is similar to the above function, the difference is this one   * will wait for all the inactive transactions until they fully   * complete.   */ @@ -760,7 +789,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)  {  	struct btrfs_fs_info *fs_info = trans->fs_info; -	if (btrfs_check_space_for_delayed_refs(trans)) +	if (btrfs_check_space_for_delayed_refs(fs_info))  		return 1;  	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -769,22 +798,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)  int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)  {  	struct btrfs_transaction *cur_trans = trans->transaction; -	int updates; -	int err;  	smp_mb();  	if (cur_trans->state >= TRANS_STATE_BLOCKED ||  	    cur_trans->delayed_refs.flushing)  		return 1; -	updates = trans->delayed_ref_updates; -	trans->delayed_ref_updates = 0; -	if (updates) { -		err = btrfs_run_delayed_refs(trans, updates * 2); -		if (err) /* Error code will also eval true */ -			return err; -	} -  	return should_end_transaction(trans);  } @@ -814,11 +833,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  {  	struct btrfs_fs_info *info = trans->fs_info;  	struct btrfs_transaction *cur_trans = trans->transaction; -	u64 transid = trans->transid; -	unsigned long cur = trans->delayed_ref_updates;  	int lock = (trans->type != TRANS_JOIN_NOLOCK);  	int err = 0; -	int must_run_delayed_refs = 0;  	if (refcount_read(&trans->use_count) > 1) {  		refcount_dec(&trans->use_count); @@ -832,27 +848,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	if (!list_empty(&trans->new_bgs))  		btrfs_create_pending_block_groups(trans); -	trans->delayed_ref_updates = 0; -	if (!trans->sync) { -		must_run_delayed_refs = -			btrfs_should_throttle_delayed_refs(trans); -		cur = max_t(unsigned long, cur, 32); - -		/* -		 * don't make the caller wait if they are from a NOLOCK -		 * or ATTACH transaction, it will deadlock with commit -		 */ -		if (must_run_delayed_refs == 1 && -		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) -	
		must_run_delayed_refs = 2; -	} - -	btrfs_trans_release_metadata(trans); -	trans->block_rsv = NULL; - -	if (!list_empty(&trans->new_bgs)) -		btrfs_create_pending_block_groups(trans); -  	btrfs_trans_release_chunk_metadata(trans);  	if (lock && should_end_transaction(trans) && @@ -894,10 +889,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	}  	kmem_cache_free(btrfs_trans_handle_cachep, trans); -	if (must_run_delayed_refs) { -		btrfs_async_run_delayed_refs(info, cur, transid, -					     must_run_delayed_refs == 1); -	}  	return err;  } @@ -1338,7 +1329,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,  		return 0;  	/* -	 * Ensure dirty @src will be commited.  Or, after comming +	 * Ensure dirty @src will be committed.  Or, after coming  	 * commit_fs_roots() and switch_commit_roots(), any dirty but not  	 * recorded root will never be updated again, causing an outdated root  	 * item. @@ -1842,7 +1833,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)  {  	struct btrfs_fs_info *fs_info = trans->fs_info;  	struct btrfs_transaction *cur_trans = trans->transaction; -	DEFINE_WAIT(wait);  	WARN_ON(refcount_read(&trans->use_count) > 1); @@ -1911,13 +1901,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)  		btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);  } -static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) -{ -	wait_event(cur_trans->pending_wait, -		   atomic_read(&cur_trans->pending_ordered) == 0); -} -  int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  {  	struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2052,8 +2035,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	btrfs_wait_delalloc_flush(fs_info); -	btrfs_wait_pending_ordered(cur_trans); -  	btrfs_scrub_pause(fs_info);  	/*  	 * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 4cbb1b55387d..f1ba78949d1b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -12,13 +12,13 @@  #include "ctree.h"  enum btrfs_trans_state { -	TRANS_STATE_RUNNING		= 0, -	TRANS_STATE_BLOCKED		= 1, -	TRANS_STATE_COMMIT_START	= 2, -	TRANS_STATE_COMMIT_DOING	= 3, -	TRANS_STATE_UNBLOCKED		= 4, -	TRANS_STATE_COMPLETED		= 5, -	TRANS_STATE_MAX			= 6, +	TRANS_STATE_RUNNING, +	TRANS_STATE_BLOCKED, +	TRANS_STATE_COMMIT_START, +	TRANS_STATE_COMMIT_DOING, +	TRANS_STATE_UNBLOCKED, +	TRANS_STATE_COMPLETED, +	TRANS_STATE_MAX,  };  #define BTRFS_TRANS_HAVE_FREE_BGS	0 @@ -39,7 +39,6 @@ struct btrfs_transaction {  	 */  	atomic_t num_writers;  	refcount_t use_count; -	atomic_t pending_ordered;  	unsigned long flags; @@ -51,7 +50,6 @@ struct btrfs_transaction {  	time64_t start_time;  	wait_queue_head_t writer_wait;  	wait_queue_head_t commit_wait; -	wait_queue_head_t pending_wait;  	struct list_head pending_snapshots;  	struct list_head pending_chunks;  	struct list_head switch_commits; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 1a4e2b101ef2..a62e1e837a89 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -27,10 +27,10 @@   *   * @type:	leaf or node   * @identifier:	the necessary info to locate the leaf/node. - * 		It's recommened to decode key.objecitd/offset if it's + * 		It's recommended to decode key.objecitd/offset if it's   * 		meaningful.   
* @reason:	describe the error - * @bad_value:	optional, it's recommened to output bad value and its + * @bad_value:	optional, it's recommended to output bad value and its   *		expected value (range).   *   * Since comma is used to separate the components, only space is allowed @@ -130,7 +130,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info,  	}  	/* -	 * Support for new compression/encrption must introduce incompat flag, +	 * Support for new compression/encryption must introduce incompat flag,  	 * and must be caught in open_ctree().  	 */  	if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a5ce99a6c936..ac232b3d6d7e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1144,7 +1144,7 @@ next:  	}  	btrfs_release_path(path); -	/* look for a conflicing name */ +	/* look for a conflicting name */  	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),  				   name, namelen, 0);  	if (di && !IS_ERR(di)) { @@ -3149,7 +3149,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	mutex_unlock(&log_root_tree->log_mutex);  	/* -	 * nobody else is going to jump in and write the the ctree +	 * Nobody else is going to jump in and write the ctree  	 * super here because the log_commit atomic below is protecting  	 * us.  We must be called with a transaction handle pinning  	 * the running transaction open, so a full commit can't hop @@ -3201,8 +3201,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans,  			  struct btrfs_root *log)  {  	int ret; -	u64 start; -	u64 end;  	struct walk_control wc = {  		.free = 1,  		.process_func = process_one_buffer @@ -3216,18 +3214,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,  			btrfs_handle_fs_error(log->fs_info, ret, NULL);  	} -	while (1) { -		ret = find_first_extent_bit(&log->dirty_log_pages, -				0, &start, &end, -				EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, -				NULL); -		if (ret) -			break; - -		clear_extent_bits(&log->dirty_log_pages, start, end, -				  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); -	} - +	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, +			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);  	free_extent_buffer(log->node);  	kfree(log);  } @@ -4383,7 +4371,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  	struct extent_map *em, *n;  	struct list_head extents;  	struct extent_map_tree *tree = &inode->extent_tree; -	u64 logged_start, logged_end;  	u64 test_gen;  	int ret = 0;  	int num = 0; @@ -4392,8 +4379,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  	write_lock(&tree->lock);  	test_gen = root->fs_info->last_trans_committed; -	logged_start = start; -	logged_end = end;  	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {  		/* @@ -4434,11 +4419,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  		    em->start >= i_size_read(&inode->vfs_inode))  			continue; -		if (em->start < logged_start) -			logged_start = em->start; -		if ((em->start + em->len - 1) > logged_end) -			logged_end = em->start + em->len - 1; -  		/* Need a ref to keep it from getting evicted from cache */  		refcount_inc(&em->refs);  		set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -5778,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,  			goto end_trans;  	} +	/* +	 * If a new hard link was added to the inode in the current transaction +	 * and its link count is now greater than 1, we need 
to fallback to a +	 * transaction commit, otherwise we can end up not logging all its new +	 * parents for all the hard links. Here just from the dentry used to +	 * fsync, we can not visit the ancestor inodes for all the other hard +	 * links to figure out if any is new, so we fallback to a transaction +	 * commit (instead of adding a lot of complexity of scanning a btree, +	 * since this scenario is not a common use case). +	 */ +	if (inode->vfs_inode.i_nlink > 1 && +	    inode->last_link_trans > last_committed) { +		ret = -EMLINK; +		goto end_trans; +	} +  	while (1) {  		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)  			break; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 767765031e59..0fab84a8f670 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -15,7 +15,6 @@  struct btrfs_log_ctx {  	int log_ret;  	int log_transid; -	int io_err;  	bool log_new_dentries;  	struct inode *inode;  	struct list_head list; @@ -26,7 +25,6 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,  {  	ctx->log_ret = 0;  	ctx->log_transid = 0; -	ctx->io_err = 0;  	ctx->log_new_dentries = false;  	ctx->inode = inode;  	INIT_LIST_HEAD(&ctx->list); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..2576b1a379c9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 1,  		.devs_increment	= 2,  		.ncopies	= 2, +		.nparity        = 0,  		.raid_name	= "raid10",  		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,  		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, @@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 1,  		.devs_increment	= 2,  		.ncopies	= 2, +		.nparity        = 0,  		.raid_name	= "raid1",  		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,  		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, @@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 0,  		.devs_increment	= 1,  		.ncopies	= 2, +		.nparity        = 0,  		.raid_name	= "dup",  		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,  		.mindev_error	= 0, @@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 0,  		.devs_increment	= 1,  		.ncopies	= 1, +		.nparity        = 0,  		.raid_name	= "raid0",  		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,  		.mindev_error	= 0, @@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 0,  		.devs_increment	= 1,  		.ncopies	= 1, +		.nparity        = 0,  		.raid_name	= "single",  		.bg_flag	= 0,  		.mindev_error	= 0, @@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.devs_min	= 2,  		.tolerated_failures = 1,  		.devs_increment	= 1, -		.ncopies	= 2, +		.ncopies	= 1, +		.nparity        = 1,  		.raid_name	= "raid5",  		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,  		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, @@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.devs_min	= 3,  		.tolerated_failures = 2,  		.devs_increment	= 1, -		.ncopies	= 3, +		.ncopies	= 1, +		.nparity        = 2,  		.raid_name	= "raid6",  		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,  		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, @@ -123,6 +130,60 @@ const char *get_raid_name(enum btrfs_raid_types type)  	return btrfs_raid_array[type].raid_name;  } +/* + * Fill @buf with textual 
description of @bg_flags, no more than @size_buf + * bytes including terminating null byte. + */ +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ +	int i; +	int ret; +	char *bp = buf; +	u64 flags = bg_flags; +	u32 size_bp = size_buf; + +	if (!flags) { +		strcpy(bp, "NONE"); +		return; +	} + +#define DESCRIBE_FLAG(flag, desc)						\ +	do {								\ +		if (flags & (flag)) {					\ +			ret = snprintf(bp, size_bp, "%s|", (desc));	\ +			if (ret < 0 || ret >= size_bp)			\ +				goto out_overflow;			\ +			size_bp -= ret;					\ +			bp += ret;					\ +			flags &= ~(flag);				\ +		}							\ +	} while (0) + +	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); +	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); +	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + +	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); +	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) +		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, +			      btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + +	if (flags) { +		ret = snprintf(bp, size_bp, "0x%llx|", flags); +		size_bp -= ret; +	} + +	if (size_bp < size_buf) +		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ + +	/* +	 * The text is trimmed, it's up to the caller to provide sufficiently +	 * large buffer +	 */ +out_overflow:; +} +  static int init_first_rw_device(struct btrfs_trans_handle *trans,  				struct btrfs_fs_info *fs_info);  static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); @@ -151,7 +212,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,   * the mutex can be very coarse and can cover long-running operations   *   * protects: updates to fs_devices counters like missing devices, rw devices, - * seeding, structure cloning, openning/closing devices at mount/umount time + * seeding, structure cloning, opening/closing devices at mount/umount time   *   * global::fs_devs - add, remove, updates to the global list   * @@ -238,13 +299,15 @@ struct list_head *btrfs_get_fs_uuids(void)  /*   * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid:	if not NULL, copy the uuid to fs_devices::fsid + * @fsid:		if not NULL, copy the UUID to fs_devices::fsid + * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid   *   * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().   * The returned struct is not linked onto any lists and can be destroyed with   * kfree() right away.   
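
btrfs_describe_block_groups(), added above, builds the flag string with a snprintf-append macro that tracks the remaining buffer space, bails out on overflow and trims the trailing separator at the end. A self-contained userspace sketch of the same pattern, using toy F_* flags rather than the real block group bits:

#include <stdint.h>
#include <stdio.h>

#define F_DATA   (1ULL << 0)
#define F_SYSTEM (1ULL << 1)
#define F_RAID1  (1ULL << 4)

static void describe_flags(uint64_t flags, char *buf, unsigned int size_buf)
{
	char *bp = buf;
	unsigned int size_bp = size_buf;
	int ret;

	buf[0] = '\0';
	if (!flags) {
		snprintf(buf, size_buf, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || (unsigned int)ret >= size_bp)	\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(F_DATA, "data");
	DESCRIBE_FLAG(F_SYSTEM, "system");
	DESCRIBE_FLAG(F_RAID1, "raid1");
#undef DESCRIBE_FLAG

out_overflow:
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0';	/* drop the last '|' */
}

int main(void)
{
	char buf[64];

	describe_flags(F_DATA | F_RAID1, buf, sizeof(buf));
	printf("%s\n", buf);	/* prints "data|raid1" */
	return 0;
}
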
*/ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, +						 const u8 *metadata_fsid)  {  	struct btrfs_fs_devices *fs_devs; @@ -261,6 +324,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)  	if (fsid)  		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); +	if (metadata_fsid) +		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); +	else if (fsid) +		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); +  	return fs_devs;  } @@ -368,13 +436,57 @@ static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,  	return NULL;  } -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +static noinline struct btrfs_fs_devices *find_fsid( +		const u8 *fsid, const u8 *metadata_fsid)  {  	struct btrfs_fs_devices *fs_devices; +	ASSERT(fsid); + +	if (metadata_fsid) { +		/* +		 * Handle scanned device having completed its fsid change but +		 * belonging to a fs_devices that was created by first scanning +		 * a device which didn't have its fsid/metadata_uuid changed +		 * at all and the CHANGING_FSID_V2 flag set. +		 */ +		list_for_each_entry(fs_devices, &fs_uuids, fs_list) { +			if (fs_devices->fsid_change && +			    memcmp(metadata_fsid, fs_devices->fsid, +				   BTRFS_FSID_SIZE) == 0 && +			    memcmp(fs_devices->fsid, fs_devices->metadata_uuid, +				   BTRFS_FSID_SIZE) == 0) { +				return fs_devices; +			} +		} +		/* +		 * Handle scanned device having completed its fsid change but +		 * belonging to a fs_devices that was created by a device that +		 * has an outdated pair of fsid/metadata_uuid and +		 * CHANGING_FSID_V2 flag set. +		 */ +		list_for_each_entry(fs_devices, &fs_uuids, fs_list) { +			if (fs_devices->fsid_change && +			    memcmp(fs_devices->metadata_uuid, +				   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && +			    memcmp(metadata_fsid, fs_devices->metadata_uuid, +				   BTRFS_FSID_SIZE) == 0) { +				return fs_devices; +			} +		} +	} + +	/* Handle non-split brain cases */  	list_for_each_entry(fs_devices, &fs_uuids, fs_list) { -		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) -			return fs_devices; +		if (metadata_fsid) { +			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 +			    && memcmp(metadata_fsid, fs_devices->metadata_uuid, +				      BTRFS_FSID_SIZE) == 0) +				return fs_devices; +		} else { +			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) +				return fs_devices; +		}  	}  	return NULL;  } @@ -709,6 +821,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,  	device->generation = btrfs_super_generation(disk_super);  	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { +		if (btrfs_super_incompat_flags(disk_super) & +		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { +			pr_err( +		"BTRFS: Invalid seeding and uuid-changed device detected\n"); +			goto error_brelse; +		} +  		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);  		fs_devices->seeding = 1;  	} else { @@ -744,6 +863,51 @@ error_brelse:  }  /* + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices + * being created with a disk that has already completed its fsid change. 
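
With the METADATA_UUID feature, find_fsid() above matches a scanned super block on both fsid and metadata_uuid, and falls back to comparing fsid alone for filesystems without split UUIDs. The sketch below shows only that comparison logic on toy structures; matches() and struct fs_devs are illustrative, not kernel types.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define FSID_SIZE 16

struct fs_devs {
	unsigned char fsid[FSID_SIZE];
	unsigned char metadata_uuid[FSID_SIZE];
};

static bool matches(const struct fs_devs *fd, const unsigned char *fsid,
		    const unsigned char *metadata_fsid)
{
	if (metadata_fsid)			/* METADATA_UUID set on disk */
		return !memcmp(fsid, fd->fsid, FSID_SIZE) &&
		       !memcmp(metadata_fsid, fd->metadata_uuid, FSID_SIZE);
	/* legacy layout: the fsid doubles as the metadata UUID */
	return !memcmp(fsid, fd->fsid, FSID_SIZE);
}

int main(void)
{
	struct fs_devs fd = { { 1 }, { 2 } };
	unsigned char on_disk_fsid[FSID_SIZE] = { 1 };
	unsigned char on_disk_meta[FSID_SIZE] = { 2 };

	printf("split match: %d\n", matches(&fd, on_disk_fsid, on_disk_meta));
	printf("legacy match: %d\n", matches(&fd, on_disk_fsid, NULL));
	return 0;
}
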
+ */ +static struct btrfs_fs_devices *find_fsid_inprogress( +					struct btrfs_super_block *disk_super) +{ +	struct btrfs_fs_devices *fs_devices; + +	list_for_each_entry(fs_devices, &fs_uuids, fs_list) { +		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, +			   BTRFS_FSID_SIZE) != 0 && +		    memcmp(fs_devices->metadata_uuid, disk_super->fsid, +			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { +			return fs_devices; +		} +	} + +	return NULL; +} + + +static struct btrfs_fs_devices *find_fsid_changed( +					struct btrfs_super_block *disk_super) +{ +	struct btrfs_fs_devices *fs_devices; + +	/* +	 * Handles the case where scanned device is part of an fs that had +	 * multiple successful changes of FSID but curently device didn't +	 * observe it. Meaning our fsid will be different than theirs. +	 */ +	list_for_each_entry(fs_devices, &fs_uuids, fs_list) { +		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, +			   BTRFS_FSID_SIZE) != 0 && +		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, +			   BTRFS_FSID_SIZE) == 0 && +		    memcmp(fs_devices->fsid, disk_super->fsid, +			   BTRFS_FSID_SIZE) != 0) { +			return fs_devices; +		} +	} + +	return NULL; +} +/*   * Add new device to list of registered devices   *   * Returns: @@ -755,14 +919,46 @@ static noinline struct btrfs_device *device_list_add(const char *path,  			   bool *new_device_added)  {  	struct btrfs_device *device; -	struct btrfs_fs_devices *fs_devices; +	struct btrfs_fs_devices *fs_devices = NULL;  	struct rcu_string *name;  	u64 found_transid = btrfs_super_generation(disk_super);  	u64 devid = btrfs_stack_device_id(&disk_super->dev_item); +	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & +		BTRFS_FEATURE_INCOMPAT_METADATA_UUID); +	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & +					BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + +	if (fsid_change_in_progress) { +		if (!has_metadata_uuid) { +			/* +			 * When we have an image which has CHANGING_FSID_V2 set +			 * it might belong to either a filesystem which has +			 * disks with completed fsid change or it might belong +			 * to fs with no UUID changes in effect, handle both. +			 */ +			fs_devices = find_fsid_inprogress(disk_super); +			if (!fs_devices) +				fs_devices = find_fsid(disk_super->fsid, NULL); +		} else { +			fs_devices = find_fsid_changed(disk_super); +		} +	} else if (has_metadata_uuid) { +		fs_devices = find_fsid(disk_super->fsid, +				       disk_super->metadata_uuid); +	} else { +		fs_devices = find_fsid(disk_super->fsid, NULL); +	} + -	fs_devices = find_fsid(disk_super->fsid);  	if (!fs_devices) { -		fs_devices = alloc_fs_devices(disk_super->fsid); +		if (has_metadata_uuid) +			fs_devices = alloc_fs_devices(disk_super->fsid, +						      disk_super->metadata_uuid); +		else +			fs_devices = alloc_fs_devices(disk_super->fsid, NULL); + +		fs_devices->fsid_change = fsid_change_in_progress; +  		if (IS_ERR(fs_devices))  			return ERR_CAST(fs_devices); @@ -774,6 +970,21 @@ static noinline struct btrfs_device *device_list_add(const char *path,  		mutex_lock(&fs_devices->device_list_mutex);  		device = find_device(fs_devices, devid,  				disk_super->dev_item.uuid); + +		/* +		 * If this disk has been pulled into an fs devices created by +		 * a device which had the CHANGING_FSID_V2 flag then replace the +		 * metadata_uuid/fsid values of the fs_devices. 
+		 */ +		if (has_metadata_uuid && fs_devices->fsid_change && +		    found_transid > fs_devices->latest_generation) { +			memcpy(fs_devices->fsid, disk_super->fsid, +					BTRFS_FSID_SIZE); +			memcpy(fs_devices->metadata_uuid, +					disk_super->metadata_uuid, BTRFS_FSID_SIZE); + +			fs_devices->fsid_change = false; +		}  	}  	if (!device) { @@ -850,6 +1061,35 @@ static noinline struct btrfs_device *device_list_add(const char *path,  			return ERR_PTR(-EEXIST);  		} +		/* +		 * We are going to replace the device path for a given devid, +		 * make sure it's the same device if the device is mounted +		 */ +		if (device->bdev) { +			struct block_device *path_bdev; + +			path_bdev = lookup_bdev(path); +			if (IS_ERR(path_bdev)) { +				mutex_unlock(&fs_devices->device_list_mutex); +				return ERR_CAST(path_bdev); +			} + +			if (device->bdev != path_bdev) { +				bdput(path_bdev); +				mutex_unlock(&fs_devices->device_list_mutex); +				btrfs_warn_in_rcu(device->fs_info, +			"duplicate device fsid:devid for %pU:%llu old:%s new:%s", +					disk_super->fsid, devid, +					rcu_str_deref(device->name), path); +				return ERR_PTR(-EEXIST); +			} +			bdput(path_bdev); +			btrfs_info_in_rcu(device->fs_info, +				"device fsid %pU devid %llu moved old:%s new:%s", +				disk_super->fsid, devid, +				rcu_str_deref(device->name), path); +		} +  		name = rcu_string_strdup(path, GFP_NOFS);  		if (!name) {  			mutex_unlock(&fs_devices->device_list_mutex); @@ -869,8 +1109,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,  	 * it back. We need it to pick the disk with largest generation  	 * (as above).  	 */ -	if (!fs_devices->opened) +	if (!fs_devices->opened) {  		device->generation = found_transid; +		fs_devices->latest_generation = max_t(u64, found_transid, +						fs_devices->latest_generation); +	}  	fs_devices->total_devices = btrfs_super_num_devices(disk_super); @@ -884,7 +1127,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)  	struct btrfs_device *device;  	struct btrfs_device *orig_dev; -	fs_devices = alloc_fs_devices(orig->fsid); +	fs_devices = alloc_fs_devices(orig->fsid, NULL);  	if (IS_ERR(fs_devices))  		return fs_devices; @@ -1193,7 +1436,7 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,  	p = kmap(*page);  	/* align our pointer to the offset of the super block */ -	*disk_super = p + (bytenr & ~PAGE_MASK); +	*disk_super = p + offset_in_page(bytenr);  	if (btrfs_super_bytenr(*disk_super) != bytenr ||  	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { @@ -1709,7 +1952,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,  	ptr = btrfs_device_uuid(dev_item);  	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);  	ptr = btrfs_device_fsid(dev_item); -	write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); +	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, +			    ptr, BTRFS_FSID_SIZE);  	btrfs_mark_buffer_dirty(leaf);  	ret = 0; @@ -1862,12 +2106,12 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)  {  	u64 num_devices = fs_info->fs_devices->num_devices; -	btrfs_dev_replace_read_lock(&fs_info->dev_replace); +	down_read(&fs_info->dev_replace.rwsem);  	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {  		ASSERT(num_devices > 1);  		num_devices--;  	} -	btrfs_dev_replace_read_unlock(&fs_info->dev_replace); +	up_read(&fs_info->dev_replace.rwsem);  	return num_devices;  } @@ -1900,6 +2144,14 @@ int btrfs_rm_device(struct btrfs_fs_info 
*fs_info, const char *device_path,  		goto out;  	} +	if (btrfs_pinned_by_swapfile(fs_info, device)) { +		btrfs_warn_in_rcu(fs_info, +		  "cannot remove device %s (devid %llu) due to active swapfile", +				  rcu_str_deref(device->name), device->devid); +		ret = -ETXTBSY; +		goto out; +	} +  	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {  		ret = BTRFS_ERROR_DEV_TGT_REPLACE;  		goto out; @@ -2132,7 +2384,13 @@ static struct btrfs_device *btrfs_find_device_by_path(  	disk_super = (struct btrfs_super_block *)bh->b_data;  	devid = btrfs_stack_device_id(&disk_super->dev_item);  	dev_uuid = disk_super->dev_item.uuid; -	device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); +	if (btrfs_fs_incompat(fs_info, METADATA_UUID)) +		device = btrfs_find_device(fs_info, devid, dev_uuid, +				disk_super->metadata_uuid); +	else +		device = btrfs_find_device(fs_info, devid, +				dev_uuid, disk_super->fsid); +  	brelse(bh);  	if (!device)  		device = ERR_PTR(-ENOENT); @@ -2202,7 +2460,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)  	if (!fs_devices->seeding)  		return -EINVAL; -	seed_devices = alloc_fs_devices(NULL); +	seed_devices = alloc_fs_devices(NULL, NULL);  	if (IS_ERR(seed_devices))  		return PTR_ERR(seed_devices); @@ -2238,7 +2496,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)  	fs_devices->seed = seed_devices;  	generate_random_uuid(fs_devices->fsid); -	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); +	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);  	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);  	mutex_unlock(&fs_devices->device_list_mutex); @@ -2480,7 +2738,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path  		 * so rename the fsid on the sysfs  		 */  		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", -						fs_info->fsid); +						fs_info->fs_devices->fsid);  		if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))  			btrfs_warn(fs_info,  				   "sysfs: failed to create fsid for sprout"); @@ -2718,8 +2976,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)  	return ret;  } -static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, -					u64 logical, u64 length) +/* + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * @logical: Logical block offset in bytes. + * @length: Length of extent in bytes. + * + * Return: Chunk mapping or ERR_PTR. 
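
btrfs_get_chunk_map() is now exported with the kernel-doc above; callers receive either a referenced extent_map or an ERR_PTR. A hypothetical caller could look like the sketch below. count_chunk_stripes() is not a btrfs function, and the sketch assumes the declaration is made visible through the btrfs private headers as part of this change.

/* Kernel-context sketch of a btrfs_get_chunk_map() caller. */
#include "ctree.h"
#include "volumes.h"

static int count_chunk_stripes(struct btrfs_fs_info *fs_info,
			       u64 logical, u64 length)
{
	struct extent_map *em;
	struct map_lookup *map;
	int num_stripes;

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);	/* error pointer when no mapping covers the range */

	map = em->map_lookup;
	num_stripes = map->num_stripes;
	free_extent_map(em);		/* drop the reference the lookup took */
	return num_stripes;
}
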
+ */ +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, +				       u64 logical, u64 length)  {  	struct extent_map_tree *em_tree;  	struct extent_map *em; @@ -2756,7 +3021,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)  	int i, ret = 0;  	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; -	em = get_chunk_map(fs_info, chunk_offset, 1); +	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);  	if (IS_ERR(em)) {  		/*  		 * This is a logic error, but we don't want to just rely on the @@ -2797,13 +3062,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)  			mutex_unlock(&fs_info->chunk_mutex);  		} -		if (map->stripes[i].dev) { -			ret = btrfs_update_device(trans, map->stripes[i].dev); -			if (ret) { -				mutex_unlock(&fs_devices->device_list_mutex); -				btrfs_abort_transaction(trans, ret); -				goto out; -			} +		ret = btrfs_update_device(trans, device); +		if (ret) { +			mutex_unlock(&fs_devices->device_list_mutex); +			btrfs_abort_transaction(trans, ret); +			goto out;  		}  	}  	mutex_unlock(&fs_devices->device_list_mutex); @@ -3437,17 +3700,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)  {  	struct btrfs_balance_control *bctl = fs_info->balance_ctl;  	struct btrfs_root *chunk_root = fs_info->chunk_root; -	struct btrfs_root *dev_root = fs_info->dev_root; -	struct list_head *devices; -	struct btrfs_device *device; -	u64 old_size; -	u64 size_to_free;  	u64 chunk_type;  	struct btrfs_chunk *chunk;  	struct btrfs_path *path = NULL;  	struct btrfs_key key;  	struct btrfs_key found_key; -	struct btrfs_trans_handle *trans;  	struct extent_buffer *leaf;  	int slot;  	int ret; @@ -3462,53 +3719,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)  	u32 count_sys = 0;  	int chunk_reserved = 0; -	/* step one make some room on all the devices */ -	devices = &fs_info->fs_devices->devices; -	list_for_each_entry(device, devices, dev_list) { -		old_size = btrfs_device_get_total_bytes(device); -		size_to_free = div_factor(old_size, 1); -		size_to_free = min_t(u64, size_to_free, SZ_1M); -		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || -		    btrfs_device_get_total_bytes(device) - -		    btrfs_device_get_bytes_used(device) > size_to_free || -		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) -			continue; - -		ret = btrfs_shrink_device(device, old_size - size_to_free); -		if (ret == -ENOSPC) -			break; -		if (ret) { -			/* btrfs_shrink_device never returns ret > 0 */ -			WARN_ON(ret > 0); -			goto error; -		} - -		trans = btrfs_start_transaction(dev_root, 0); -		if (IS_ERR(trans)) { -			ret = PTR_ERR(trans); -			btrfs_info_in_rcu(fs_info, -		 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", -					  rcu_str_deref(device->name), ret, -					  old_size, old_size - size_to_free); -			goto error; -		} - -		ret = btrfs_grow_device(trans, device, old_size); -		if (ret) { -			btrfs_end_transaction(trans); -			/* btrfs_grow_device never returns ret > 0 */ -			WARN_ON(ret > 0); -			btrfs_info_in_rcu(fs_info, -		 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", -					  rcu_str_deref(device->name), ret, -					  old_size, old_size - size_to_free); -			goto error; -		} - -		btrfs_end_transaction(trans); -	} - -	/* step two, relocate all the chunks */  	path = btrfs_alloc_path();  	if (!path) {  		ret = -ENOMEM; @@ -3638,10 +3848,15 @@ again:  		ret = 
btrfs_relocate_chunk(fs_info, found_key.offset);  		mutex_unlock(&fs_info->delete_unused_bgs_mutex); -		if (ret && ret != -ENOSPC) -			goto error;  		if (ret == -ENOSPC) {  			enospc_errors++; +		} else if (ret == -ETXTBSY) { +			btrfs_info(fs_info, +	   "skipping relocation of block group %llu due to active swapfile", +				   found_key.offset); +			ret = 0; +		} else if (ret) { +			goto error;  		} else {  			spin_lock(&fs_info->balance_lock);  			bctl->stat.completed++; @@ -3712,6 +3927,162 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,  }  /* + * Fill @buf with textual description of balance filter flags @bargs, up to + * @size_buf including the terminating null. The output may be trimmed if it + * does not fit into the provided buffer. + */ +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, +				 u32 size_buf) +{ +	int ret; +	u32 size_bp = size_buf; +	char *bp = buf; +	u64 flags = bargs->flags; +	char tmp_buf[128] = {'\0'}; + +	if (!flags) +		return; + +#define CHECK_APPEND_NOARG(a)						\ +	do {								\ +		ret = snprintf(bp, size_bp, (a));			\ +		if (ret < 0 || ret >= size_bp)				\ +			goto out_overflow;				\ +		size_bp -= ret;						\ +		bp += ret;						\ +	} while (0) + +#define CHECK_APPEND_1ARG(a, v1)					\ +	do {								\ +		ret = snprintf(bp, size_bp, (a), (v1));			\ +		if (ret < 0 || ret >= size_bp)				\ +			goto out_overflow;				\ +		size_bp -= ret;						\ +		bp += ret;						\ +	} while (0) + +#define CHECK_APPEND_2ARG(a, v1, v2)					\ +	do {								\ +		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\ +		if (ret < 0 || ret >= size_bp)				\ +			goto out_overflow;				\ +		size_bp -= ret;						\ +		bp += ret;						\ +	} while (0) + +	if (flags & BTRFS_BALANCE_ARGS_CONVERT) { +		int index = btrfs_bg_flags_to_raid_index(bargs->target); + +		CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); +	} + +	if (flags & BTRFS_BALANCE_ARGS_SOFT) +		CHECK_APPEND_NOARG("soft,"); + +	if (flags & BTRFS_BALANCE_ARGS_PROFILES) { +		btrfs_describe_block_groups(bargs->profiles, tmp_buf, +					    sizeof(tmp_buf)); +		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); +	} + +	if (flags & BTRFS_BALANCE_ARGS_USAGE) +		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); + +	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) +		CHECK_APPEND_2ARG("usage=%u..%u,", +				  bargs->usage_min, bargs->usage_max); + +	if (flags & BTRFS_BALANCE_ARGS_DEVID) +		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); + +	if (flags & BTRFS_BALANCE_ARGS_DRANGE) +		CHECK_APPEND_2ARG("drange=%llu..%llu,", +				  bargs->pstart, bargs->pend); + +	if (flags & BTRFS_BALANCE_ARGS_VRANGE) +		CHECK_APPEND_2ARG("vrange=%llu..%llu,", +				  bargs->vstart, bargs->vend); + +	if (flags & BTRFS_BALANCE_ARGS_LIMIT) +		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); + +	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) +		CHECK_APPEND_2ARG("limit=%u..%u,", +				bargs->limit_min, bargs->limit_max); + +	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) +		CHECK_APPEND_2ARG("stripes=%u..%u,", +				  bargs->stripes_min, bargs->stripes_max); + +#undef CHECK_APPEND_2ARG +#undef CHECK_APPEND_1ARG +#undef CHECK_APPEND_NOARG + +out_overflow: + +	if (size_bp < size_buf) +		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ +	else +		buf[0] = '\0'; +} + +static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) +{ +	u32 size_buf = 1024; +	char tmp_buf[192] = {'\0'}; +	char *buf; +	char *bp; +	u32 size_bp = size_buf; +	int ret; +	struct btrfs_balance_control *bctl = fs_info->balance_ctl; + +	
buf = kzalloc(size_buf, GFP_KERNEL); +	if (!buf) +		return; + +	bp = buf; + +#define CHECK_APPEND_1ARG(a, v1)					\ +	do {								\ +		ret = snprintf(bp, size_bp, (a), (v1));			\ +		if (ret < 0 || ret >= size_bp)				\ +			goto out_overflow;				\ +		size_bp -= ret;						\ +		bp += ret;						\ +	} while (0) + +	if (bctl->flags & BTRFS_BALANCE_FORCE) +		CHECK_APPEND_1ARG("%s", "-f "); + +	if (bctl->flags & BTRFS_BALANCE_DATA) { +		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); +		CHECK_APPEND_1ARG("-d%s ", tmp_buf); +	} + +	if (bctl->flags & BTRFS_BALANCE_METADATA) { +		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); +		CHECK_APPEND_1ARG("-m%s ", tmp_buf); +	} + +	if (bctl->flags & BTRFS_BALANCE_SYSTEM) { +		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); +		CHECK_APPEND_1ARG("-s%s ", tmp_buf); +	} + +#undef CHECK_APPEND_1ARG + +out_overflow: + +	if (size_bp < size_buf) +		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ +	btrfs_info(fs_info, "balance: %s %s", +		   (bctl->flags & BTRFS_BALANCE_RESUME) ? +		   "resume" : "start", buf); + +	kfree(buf); +} + +/*   * Should be called with balance mutexe held   */  int btrfs_balance(struct btrfs_fs_info *fs_info, @@ -3724,6 +4095,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,  	int ret;  	u64 num_devices;  	unsigned seq; +	bool reducing_integrity;  	if (btrfs_fs_closing(fs_info) ||  	    atomic_read(&fs_info->balance_pause_req) || @@ -3803,24 +4175,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,  		     !(bctl->sys.target & allowed)) ||  		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&  		     (fs_info->avail_metadata_alloc_bits & allowed) && -		     !(bctl->meta.target & allowed))) { -			if (bctl->flags & BTRFS_BALANCE_FORCE) { -				btrfs_info(fs_info, -				"balance: force reducing metadata integrity"); -			} else { -				btrfs_err(fs_info, -	"balance: reduces metadata integrity, use --force if you want this"); -				ret = -EINVAL; -				goto out; -			} -		} +		     !(bctl->meta.target & allowed))) +			reducing_integrity = true; +		else +			reducing_integrity = false; + +		/* if we're not converting, the target field is uninitialized */ +		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? +			bctl->meta.target : fs_info->avail_metadata_alloc_bits; +		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? +			bctl->data.target : fs_info->avail_data_alloc_bits;  	} while (read_seqretry(&fs_info->profiles_lock, seq)); -	/* if we're not converting, the target field is uninitialized */ -	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? -		bctl->meta.target : fs_info->avail_metadata_alloc_bits; -	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
-		bctl->data.target : fs_info->avail_data_alloc_bits; +	if (reducing_integrity) { +		if (bctl->flags & BTRFS_BALANCE_FORCE) { +			btrfs_info(fs_info, +				   "balance: force reducing metadata integrity"); +		} else { +			btrfs_err(fs_info, +	  "balance: reduces metadata integrity, use --force if you want this"); +			ret = -EINVAL; +			goto out; +		} +	} +  	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <  		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {  		int meta_index = btrfs_bg_flags_to_raid_index(meta_target); @@ -3850,11 +4228,19 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,  	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));  	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); +	describe_balance_start_or_resume(fs_info);  	mutex_unlock(&fs_info->balance_mutex);  	ret = __btrfs_balance(fs_info);  	mutex_lock(&fs_info->balance_mutex); +	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) +		btrfs_info(fs_info, "balance: paused"); +	else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) +		btrfs_info(fs_info, "balance: canceled"); +	else +		btrfs_info(fs_info, "balance: ended with status: %d", ret); +  	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);  	if (bargs) { @@ -3887,10 +4273,8 @@ static int balance_kthread(void *data)  	int ret = 0;  	mutex_lock(&fs_info->balance_mutex); -	if (fs_info->balance_ctl) { -		btrfs_info(fs_info, "balance: resuming"); +	if (fs_info->balance_ctl)  		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); -	}  	mutex_unlock(&fs_info->balance_mutex);  	return ret; @@ -4433,10 +4817,16 @@ again:  		ret = btrfs_relocate_chunk(fs_info, chunk_offset);  		mutex_unlock(&fs_info->delete_unused_bgs_mutex); -		if (ret && ret != -ENOSPC) -			goto done; -		if (ret == -ENOSPC) +		if (ret == -ENOSPC) {  			failed++; +		} else if (ret) { +			if (ret == -ETXTBSY) { +				btrfs_warn(fs_info, +		   "could not shrink block group %llu due to active swapfile", +					   chunk_offset); +			} +			goto done; +		}  	} while (key.offset-- > 0);  	if (failed && !retried) { @@ -4602,11 +4992,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	int devs_min;		/* min devs needed */  	int devs_increment;	/* ndevs has to be a multiple of this */  	int ncopies;		/* how many copies to data has */ +	int nparity;		/* number of stripes worth of bytes to +				   store parity information */  	int ret;  	u64 max_stripe_size;  	u64 max_chunk_size;  	u64 stripe_size; -	u64 num_bytes; +	u64 chunk_size;  	int ndevs;  	int i;  	int j; @@ -4628,6 +5020,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	devs_min = btrfs_raid_array[index].devs_min;  	devs_increment = btrfs_raid_array[index].devs_increment;  	ncopies = btrfs_raid_array[index].ncopies; +	nparity = btrfs_raid_array[index].nparity;  	if (type & BTRFS_BLOCK_GROUP_DATA) {  		max_stripe_size = SZ_1G; @@ -4654,7 +5047,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		BUG_ON(1);  	} -	/* we don't want a chunk larger than 10% of writeable space */ +	/* We don't want a chunk larger than 10% of writable space */  	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),  			     max_chunk_size); @@ -4757,30 +5150,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	 * this will have to be fixed for RAID1 and RAID10 over  	 * more drives  	 */ -	data_stripes = num_stripes / ncopies; - -	if (type & BTRFS_BLOCK_GROUP_RAID5) -		data_stripes = num_stripes - 1; - -	if 
(type & BTRFS_BLOCK_GROUP_RAID6) -		data_stripes = num_stripes - 2; +	data_stripes = (num_stripes - nparity) / ncopies;  	/*  	 * Use the number of data stripes to figure out how big this chunk  	 * is really going to be in terms of logical address space, -	 * and compare that answer with the max chunk size +	 * and compare that answer with the max chunk size. If it's higher, +	 * we try to reduce stripe_size.  	 */  	if (stripe_size * data_stripes > max_chunk_size) { -		stripe_size = div_u64(max_chunk_size, data_stripes); - -		/* bump the answer up to a 16MB boundary */ -		stripe_size = round_up(stripe_size, SZ_16M); -  		/* -		 * But don't go higher than the limits we found while searching -		 * for free extents +		 * Reduce stripe_size, round it up to a 16MB boundary again and +		 * then use it, unless it ends up being even bigger than the +		 * previous value we had already.  		 */ -		stripe_size = min(devices_info[ndevs - 1].max_avail, +		stripe_size = min(round_up(div_u64(max_chunk_size, +						   data_stripes), SZ_16M),  				  stripe_size);  	} @@ -4808,9 +5193,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	map->type = type;  	map->sub_stripes = sub_stripes; -	num_bytes = stripe_size * data_stripes; +	chunk_size = stripe_size * data_stripes; -	trace_btrfs_chunk_alloc(info, map, start, num_bytes); +	trace_btrfs_chunk_alloc(info, map, start, chunk_size);  	em = alloc_extent_map();  	if (!em) { @@ -4821,7 +5206,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);  	em->map_lookup = map;  	em->start = start; -	em->len = num_bytes; +	em->len = chunk_size;  	em->block_start = 0;  	em->block_len = em->len;  	em->orig_block_len = stripe_size; @@ -4839,14 +5224,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	refcount_inc(&em->refs);  	write_unlock(&em_tree->lock); -	ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); +	ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);  	if (ret)  		goto error_del_extent; -	for (i = 0; i < map->num_stripes; i++) { -		num_bytes = map->stripes[i].dev->bytes_used + stripe_size; -		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); -	} +	for (i = 0; i < map->num_stripes; i++) +		btrfs_device_set_bytes_used(map->stripes[i].dev, +				map->stripes[i].dev->bytes_used + stripe_size);  	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); @@ -4890,7 +5274,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,  	int i = 0;  	int ret = 0; -	em = get_chunk_map(fs_info, chunk_offset, chunk_size); +	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);  	if (IS_ERR(em))  		return PTR_ERR(em); @@ -4971,10 +5355,10 @@ out:  }  /* - * Chunk allocation falls into two parts. The first part does works - * that make the new allocated chunk useable, but not do any operation - * that modifies the chunk tree. The second part does the works that - * require modifying the chunk tree. This division is important for the + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the   * bootstrap process of adding storage to a seed btrfs.   
*/  int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) @@ -5032,7 +5416,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)  	int miss_ndevs = 0;  	int i; -	em = get_chunk_map(fs_info, chunk_offset, 1); +	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);  	if (IS_ERR(em))  		return 1; @@ -5092,7 +5476,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  	struct map_lookup *map;  	int ret; -	em = get_chunk_map(fs_info, logical, len); +	em = btrfs_get_chunk_map(fs_info, logical, len);  	if (IS_ERR(em))  		/*  		 * We could return errors for these cases, but that could get @@ -5122,11 +5506,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  		ret = 1;  	free_extent_map(em); -	btrfs_dev_replace_read_lock(&fs_info->dev_replace); +	down_read(&fs_info->dev_replace.rwsem);  	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&  	    fs_info->dev_replace.tgtdev)  		ret++; -	btrfs_dev_replace_read_unlock(&fs_info->dev_replace); +	up_read(&fs_info->dev_replace.rwsem);  	return ret;  } @@ -5138,7 +5522,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,  	struct map_lookup *map;  	unsigned long len = fs_info->sectorsize; -	em = get_chunk_map(fs_info, logical, len); +	em = btrfs_get_chunk_map(fs_info, logical, len);  	if (!WARN_ON(IS_ERR(em))) {  		map = em->map_lookup; @@ -5155,7 +5539,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  	struct map_lookup *map;  	int ret = 0; -	em = get_chunk_map(fs_info, logical, len); +	em = btrfs_get_chunk_map(fs_info, logical, len);  	if(!WARN_ON(IS_ERR(em))) {  		map = em->map_lookup; @@ -5314,7 +5698,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,  	/* discard always return a bbio */  	ASSERT(bbio_ret); -	em = get_chunk_map(fs_info, logical, length); +	em = btrfs_get_chunk_map(fs_info, logical, length);  	if (IS_ERR(em))  		return PTR_ERR(em); @@ -5640,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,  		return __btrfs_map_block_for_discard(fs_info, logical,  						     *length, bbio_ret); -	em = get_chunk_map(fs_info, logical, *length); +	em = btrfs_get_chunk_map(fs_info, logical, *length);  	if (IS_ERR(em))  		return PTR_ERR(em); @@ -5699,17 +6083,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,  		*length = em->len - offset;  	} -	/* This is for when we're called from btrfs_merge_bio_hook() and all -	   it cares about is the length */ +	/* +	 * This is for when we're called from btrfs_bio_fits_in_stripe and all +	 * it cares about is the length +	 */  	if (!bbio_ret)  		goto out; -	btrfs_dev_replace_read_lock(dev_replace); +	down_read(&dev_replace->rwsem);  	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); +	/* +	 * Hold the semaphore for read during the whole operation, write is +	 * requested at commit time but must wait. 
+	 */  	if (!dev_replace_is_ongoing) -		btrfs_dev_replace_read_unlock(dev_replace); -	else -		btrfs_dev_replace_set_lock_blocking(dev_replace); +		up_read(&dev_replace->rwsem);  	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&  	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) { @@ -5904,12 +6292,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,  	}  out:  	if (dev_replace_is_ongoing) { -		ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); -		btrfs_dev_replace_read_lock(dev_replace); -		/* Barrier implied by atomic_dec_and_test */ -		if (atomic_dec_and_test(&dev_replace->blocking_readers)) -			cond_wake_up_nomb(&dev_replace->read_lock_wq); -		btrfs_dev_replace_read_unlock(dev_replace); +		lockdep_assert_held(&dev_replace->rwsem); +		/* Unlock and let waiting writers proceed */ +		up_read(&dev_replace->rwsem);  	}  	free_extent_map(em);  	return ret; @@ -5943,7 +6328,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,  	u64 rmap_len;  	int i, j, nr = 0; -	em = get_chunk_map(fs_info, chunk_start, 1); +	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);  	if (IS_ERR(em))  		return -EIO; @@ -6083,12 +6468,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,  	int should_queue = 1;  	struct btrfs_pending_bios *pending_bios; -	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || -	    !device->bdev) { -		bio_io_error(bio); -		return; -	} -  	/* don't bother with additional async steps for reads, right now */  	if (bio_op(bio) == REQ_OP_READ) {  		btrfsic_submit_bio(bio); @@ -6217,7 +6596,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,  	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {  		dev = bbio->stripes[dev_nr].dev; -		if (!dev || !dev->bdev || +		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, +						   &dev->dev_state) ||  		    (bio_op(first_bio) == REQ_OP_WRITE &&  		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {  			bbio_error(bbio, first_bio, logical); @@ -6245,7 +6625,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,  	cur_devices = fs_info->fs_devices;  	while (cur_devices) {  		if (!fsid || -		    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { +		    !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {  			device = find_device(cur_devices, devid, uuid);  			if (device)  				return device; @@ -6574,12 +6954,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,  		fs_devices = fs_devices->seed;  	} -	fs_devices = find_fsid(fsid); +	fs_devices = find_fsid(fsid, NULL);  	if (!fs_devices) {  		if (!btrfs_test_opt(fs_info, DEGRADED))  			return ERR_PTR(-ENOENT); -		fs_devices = alloc_fs_devices(fsid); +		fs_devices = alloc_fs_devices(fsid, NULL);  		if (IS_ERR(fs_devices))  			return fs_devices; @@ -6629,7 +7009,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,  	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),  			   BTRFS_FSID_SIZE); -	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { +	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {  		fs_devices = open_seed_devices(fs_info, fs_uuid);  		if (IS_ERR(fs_devices))  			return PTR_ERR(fs_devices); @@ -6876,7 +7256,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,  		if (missing > max_tolerated) {  			if (!failing_dev)  				btrfs_warn(fs_info, -	"chunk %llu missing %d devices, max tolerance is %d for writeable mount", +	"chunk %llu 
missing %d devices, max tolerance is %d for writable mount",  				   em->start, missing, max_tolerated);  			free_extent_map(em);  			ret = false; @@ -7387,6 +7767,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,  	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;  	struct extent_map *em;  	struct map_lookup *map; +	struct btrfs_device *dev;  	u64 stripe_len;  	bool found = false;  	int ret = 0; @@ -7436,6 +7817,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,  			physical_offset, devid);  		ret = -EUCLEAN;  	} + +	/* Make sure no dev extent is beyond device boundary */ +	dev = btrfs_find_device(fs_info, devid, NULL, NULL); +	if (!dev) { +		btrfs_err(fs_info, "failed to find devid %llu", devid); +		ret = -EUCLEAN; +		goto out; +	} +	if (physical_offset + physical_len > dev->disk_total_bytes) { +		btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", +			  devid, physical_offset, physical_len, +			  dev->disk_total_bytes); +		ret = -EUCLEAN; +		goto out; +	}  out:  	free_extent_map(em);  	return ret; @@ -7478,6 +7875,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)  	struct btrfs_path *path;  	struct btrfs_root *root = fs_info->dev_root;  	struct btrfs_key key; +	u64 prev_devid = 0; +	u64 prev_dev_ext_end = 0;  	int ret = 0;  	key.objectid = 1; @@ -7522,10 +7921,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)  		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);  		physical_len = btrfs_dev_extent_length(leaf, dext); +		/* Check if this dev extent overlaps with the previous one */ +		if (devid == prev_devid && physical_offset < prev_dev_ext_end) { +			btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", +				  devid, physical_offset, prev_dev_ext_end); +			ret = -EUCLEAN; +			goto out; +		} +  		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,  					    physical_offset, physical_len);  		if (ret < 0)  			goto out; +		prev_devid = devid; +		prev_dev_ext_end = physical_offset + physical_len; +  		ret = btrfs_next_item(root, path);  		if (ret < 0)  			goto out; @@ -7541,3 +7952,27 @@ out:  	btrfs_free_path(path);  	return ret;  } + +/* + * Check whether the given block group or device is pinned by any inode being + * used as a swapfile. 
+ */ +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) +{ +	struct btrfs_swapfile_pin *sp; +	struct rb_node *node; + +	spin_lock(&fs_info->swapfile_pins_lock); +	node = fs_info->swapfile_pins.rb_node; +	while (node) { +		sp = rb_entry(node, struct btrfs_swapfile_pin, node); +		if (ptr < sp->ptr) +			node = node->rb_left; +		else if (ptr > sp->ptr) +			node = node->rb_right; +		else +			break; +	} +	spin_unlock(&fs_info->swapfile_pins_lock); +	return node != NULL; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..ed806649a473 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -210,6 +210,8 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);  struct btrfs_fs_devices {  	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ +	u8 metadata_uuid[BTRFS_FSID_SIZE]; +	bool fsid_change;  	struct list_head fs_list;  	u64 num_devices; @@ -218,6 +220,10 @@ struct btrfs_fs_devices {  	u64 missing_devices;  	u64 total_rw_bytes;  	u64 total_devices; + +	/* Highest generation number of seen devices */ +	u64 latest_generation; +  	struct block_device *latest_bdev;  	/* all of the devices in the FS, protected by a mutex @@ -261,15 +267,12 @@ struct btrfs_fs_devices {   * we allocate are actually btrfs_io_bios.  We'll cram as much of   * struct btrfs_bio as we can into this over time.   */ -typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);  struct btrfs_io_bio {  	unsigned int mirror_num;  	unsigned int stripe_index;  	u64 logical;  	u8 *csum;  	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; -	u8 *csum_allocated; -	btrfs_io_bio_end_io_t *end_io;  	struct bvec_iter iter;  	/*  	 * This member must come last, bio_alloc_bioset will allocate enough @@ -283,15 +286,20 @@ static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)  	return container_of(bio, struct btrfs_io_bio, bio);  } +static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio) +{ +	if (io_bio->csum != io_bio->csum_inline) { +		kfree(io_bio->csum); +		io_bio->csum = NULL; +	} +} +  struct btrfs_bio_stripe {  	struct btrfs_device *dev;  	u64 physical;  	u64 length; /* only used for discard mappings */  }; -struct btrfs_bio; -typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -  struct btrfs_bio {  	refcount_t refs;  	atomic_t stripes_pending; @@ -331,6 +339,8 @@ struct btrfs_raid_attr {  	int tolerated_failures; /* max tolerated fail devs */  	int devs_increment;	/* ndevs has to be a multiple of this */  	int ncopies;		/* how many copies to data has */ +	int nparity;		/* number of stripes worth of bytes to store +				 * parity information */  	int mindev_error;	/* error code if min devs requisite is unmet */  	const char raid_name[8]; /* name of the raid */  	u64 bg_flag;		/* block group flag of the raid */ @@ -430,6 +440,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);  int btrfs_balance(struct btrfs_fs_info *fs_info,  		  struct btrfs_balance_control *bctl,  		  struct btrfs_ioctl_balance_args *bargs); +void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);  int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);  int btrfs_recover_balance(struct btrfs_fs_info *fs_info);  int btrfs_pause_balance(struct btrfs_fs_info *fs_info); @@ -462,6 +473,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,  int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,  			     u64 chunk_offset, u64 chunk_size);  int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); 
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, +				       u64 logical, u64 length);  static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,  				      int index) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ea78c3d6dcfc..f141b45ce349 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -11,6 +11,7 @@  #include <linux/security.h>  #include <linux/posix_acl_xattr.h>  #include <linux/iversion.h> +#include <linux/sched/mm.h>  #include "ctree.h"  #include "btrfs_inode.h"  #include "transaction.h" @@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode,  {  	const struct xattr *xattr;  	struct btrfs_trans_handle *trans = fs_info; +	unsigned int nofs_flag;  	char *name;  	int err = 0; +	/* +	 * We're holding a transaction handle, so use a NOFS memory allocation +	 * context to avoid deadlock if reclaim happens. +	 */ +	nofs_flag = memalloc_nofs_save();  	for (xattr = xattr_array; xattr->name != NULL; xattr++) {  		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +  			       strlen(xattr->name) + 1, GFP_KERNEL); @@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode,  		if (err < 0)  			break;  	} +	memalloc_nofs_restore(nofs_flag);  	return err;  }  | 
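A minimal sketch of the scoped-NOFS allocation pattern used by the xattr.c hunk above, where btrfs_initxattrs() wraps its GFP_KERNEL allocations in memalloc_nofs_save()/memalloc_nofs_restore() because a transaction handle is held. The helper name alloc_with_trans_held() is illustrative only and not part of the patch.

/* Sketch: scoped NOFS allocation while a transaction handle is held. */
#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *alloc_with_trans_held(size_t len)
{
	unsigned int nofs_flag;
	void *p;

	/*
	 * Every allocation between save and restore is implicitly treated as
	 * GFP_NOFS, so direct reclaim cannot recurse into the filesystem and
	 * deadlock on the transaction we are holding.
	 */
	nofs_flag = memalloc_nofs_save();
	p = kmalloc(len, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

	return p;
}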
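The CHECK_APPEND_*ARG macros in describe_balance_args() and describe_balance_start_or_resume() above implement a bounded append into a fixed buffer: snprintf reports the length it would have written, truncation aborts the loop via out_overflow, and the caller then trims the trailing separator. A sketch of the same pattern as a plain helper, assuming illustrative names (append_one() is not part of the patch).

/* Sketch: bounded snprintf append, mirroring what CHECK_APPEND_1ARG does. */
#include <linux/kernel.h>
#include <linux/types.h>

static bool append_one(char **bp, u32 *size_bp, const char *fmt, u64 val)
{
	int ret = snprintf(*bp, *size_bp, fmt, val);

	/* snprintf returns the full wanted length; >= size means truncated */
	if (ret < 0 || ret >= *size_bp)
		return false;	/* caller trims the partial output */

	*bp += ret;
	*size_bp -= ret;
	return true;
}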
