diff options
Diffstat (limited to 'fs/btrfs/ctree.c')
| -rw-r--r-- | fs/btrfs/ctree.c | 835 | 
1 files changed, 706 insertions, 129 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 15cbc2bf4ff0..9d7621f271ff 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -321,7 +321,7 @@ struct tree_mod_root {  struct tree_mod_elem {  	struct rb_node node;  	u64 index;		/* shifted logical */ -	struct seq_list elem; +	u64 seq;  	enum mod_log_op op;  	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ @@ -341,20 +341,50 @@ struct tree_mod_elem {  	struct tree_mod_root old_root;  }; -static inline void -__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)  { -	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); -	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +	read_lock(&fs_info->tree_mod_log_lock);  } -void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, -			    struct seq_list *elem) +static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) +{ +	read_unlock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) +{ +	write_lock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) +{ +	write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, +			   struct seq_list *elem)  { -	elem->flags = 1; +	u64 seq; + +	tree_mod_log_write_lock(fs_info);  	spin_lock(&fs_info->tree_mod_seq_lock); -	__get_tree_mod_seq(fs_info, elem); +	if (!elem->seq) { +		elem->seq = btrfs_inc_tree_mod_seq(fs_info); +		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +	} +	seq = btrfs_inc_tree_mod_seq(fs_info);  	spin_unlock(&fs_info->tree_mod_seq_lock); +	tree_mod_log_write_unlock(fs_info); + +	return seq;  }  void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -371,41 +401,46 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,  	if (!seq_putting)  		return; -	BUG_ON(!(elem->flags & 1));  	spin_lock(&fs_info->tree_mod_seq_lock);  	list_del(&elem->list); +	elem->seq = 0;  	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { -		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { +		if (cur_elem->seq < min_seq) {  			if (seq_putting > cur_elem->seq) {  				/*  				 * blocker with lower sequence number exists, we  				 * cannot remove anything from the log  				 */ -				goto out; +				spin_unlock(&fs_info->tree_mod_seq_lock); +				return;  			}  			min_seq = cur_elem->seq;  		}  	} +	spin_unlock(&fs_info->tree_mod_seq_lock); + +	/* +	 * we removed the lowest blocker from the blocker list, so there may be +	 * more processible delayed refs. +	 */ +	wake_up(&fs_info->tree_mod_seq_wait);  	/*  	 * anything that's lower than the lowest existing (read: blocked)  	 * sequence number can be removed from the tree.  	 */ -	write_lock(&fs_info->tree_mod_log_lock); +	tree_mod_log_write_lock(fs_info);  	tm_root = &fs_info->tree_mod_log;  	for (node = rb_first(tm_root); node; node = next) {  		next = rb_next(node);  		tm = container_of(node, struct tree_mod_elem, node); -		if (tm->elem.seq > min_seq) +		if (tm->seq > min_seq)  			continue;  		rb_erase(node, tm_root); -		list_del(&tm->elem.list);  		kfree(tm);  	} -	write_unlock(&fs_info->tree_mod_log_lock); -out: -	spin_unlock(&fs_info->tree_mod_seq_lock); +	tree_mod_log_write_unlock(fs_info);  }  /* @@ -423,11 +458,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)  	struct rb_node **new;  	struct rb_node *parent = NULL;  	struct tree_mod_elem *cur; -	int ret = 0; -	BUG_ON(!tm || !tm->elem.seq); +	BUG_ON(!tm || !tm->seq); -	write_lock(&fs_info->tree_mod_log_lock);  	tm_root = &fs_info->tree_mod_log;  	new = &tm_root->rb_node;  	while (*new) { @@ -437,88 +470,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)  			new = &((*new)->rb_left);  		else if (cur->index > tm->index)  			new = &((*new)->rb_right); -		else if (cur->elem.seq < tm->elem.seq) +		else if (cur->seq < tm->seq)  			new = &((*new)->rb_left); -		else if (cur->elem.seq > tm->elem.seq) +		else if (cur->seq > tm->seq)  			new = &((*new)->rb_right);  		else {  			kfree(tm); -			ret = -EEXIST; -			goto unlock; +			return -EEXIST;  		}  	}  	rb_link_node(&tm->node, parent, new);  	rb_insert_color(&tm->node, tm_root); -unlock: -	write_unlock(&fs_info->tree_mod_log_lock); -	return ret; +	return 0;  } +/* + * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it + * returns zero with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * call tree_mod_log_write_unlock() to release. + */  static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,  				    struct extent_buffer *eb) {  	smp_mb();  	if (list_empty(&(fs_info)->tree_mod_seq_list))  		return 1; -	if (!eb) -		return 0; -	if (btrfs_header_level(eb) == 0) +	if (eb && btrfs_header_level(eb) == 0)  		return 1; + +	tree_mod_log_write_lock(fs_info); +	if (list_empty(&fs_info->tree_mod_seq_list)) { +		/* +		 * someone emptied the list while we were waiting for the lock. +		 * we must not add to the list when no blocker exists. +		 */ +		tree_mod_log_write_unlock(fs_info); +		return 1; +	} +  	return 0;  }  /* - * This allocates memory and gets a tree modification sequence number when - * needed. + * This allocates memory and gets a tree modification sequence number.   * - * Returns 0 when no sequence number is needed, < 0 on error. - * Returns 1 when a sequence number was added. In this case, - * fs_info->tree_mod_seq_lock was acquired and must be released by the caller - * after inserting into the rb tree. + * Returns <0 on error. + * Returns >0 (the added sequence number) on success.   */  static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,  				 struct tree_mod_elem **tm_ret)  {  	struct tree_mod_elem *tm; -	int seq; - -	if (tree_mod_dont_log(fs_info, NULL)) -		return 0; -	tm = *tm_ret = kzalloc(sizeof(*tm), flags); +	/* +	 * once we switch from spin locks to something different, we should +	 * honor the flags parameter here. +	 */ +	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);  	if (!tm)  		return -ENOMEM; -	tm->elem.flags = 0; -	spin_lock(&fs_info->tree_mod_seq_lock); -	if (list_empty(&fs_info->tree_mod_seq_list)) { -		/* -		 * someone emptied the list while we were waiting for the lock. -		 * we must not add to the list, because no blocker exists. items -		 * are removed from the list only when the existing blocker is -		 * removed from the list. -		 */ -		kfree(tm); -		seq = 0; -		spin_unlock(&fs_info->tree_mod_seq_lock); -	} else { -		__get_tree_mod_seq(fs_info, &tm->elem); -		seq = tm->elem.seq; -	} - -	return seq; +	tm->seq = btrfs_inc_tree_mod_seq(fs_info); +	return tm->seq;  } -static noinline int -tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, -			     struct extent_buffer *eb, int slot, -			     enum mod_log_op op, gfp_t flags) +static inline int +__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, +			  struct extent_buffer *eb, int slot, +			  enum mod_log_op op, gfp_t flags)  { -	struct tree_mod_elem *tm;  	int ret; +	struct tree_mod_elem *tm;  	ret = tree_mod_alloc(fs_info, flags, &tm); -	if (ret <= 0) +	if (ret < 0)  		return ret;  	tm->index = eb->start >> PAGE_CACHE_SHIFT; @@ -530,8 +556,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,  	tm->slot = slot;  	tm->generation = btrfs_node_ptr_generation(eb, slot); -	ret = __tree_mod_log_insert(fs_info, tm); -	spin_unlock(&fs_info->tree_mod_seq_lock); +	return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, +			     struct extent_buffer *eb, int slot, +			     enum mod_log_op op, gfp_t flags) +{ +	int ret; + +	if (tree_mod_dont_log(fs_info, eb)) +		return 0; + +	ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + +	tree_mod_log_write_unlock(fs_info);  	return ret;  } @@ -543,6 +583,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,  }  static noinline int +tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info, +			     struct extent_buffer *eb, int slot, +			     enum mod_log_op op) +{ +	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int  tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,  			 struct extent_buffer *eb, int dst_slot, int src_slot,  			 int nr_items, gfp_t flags) @@ -555,14 +603,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,  		return 0;  	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { -		ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, +		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,  					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);  		BUG_ON(ret < 0);  	}  	ret = tree_mod_alloc(fs_info, flags, &tm); -	if (ret <= 0) -		return ret; +	if (ret < 0) +		goto out;  	tm->index = eb->start >> PAGE_CACHE_SHIFT;  	tm->slot = src_slot; @@ -571,10 +619,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,  	tm->op = MOD_LOG_MOVE_KEYS;  	ret = __tree_mod_log_insert(fs_info, tm); -	spin_unlock(&fs_info->tree_mod_seq_lock); +out: +	tree_mod_log_write_unlock(fs_info);  	return ret;  } +static inline void +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +{ +	int i; +	u32 nritems; +	int ret; + +	nritems = btrfs_header_nritems(eb); +	for (i = nritems - 1; i >= 0; i--) { +		ret = tree_mod_log_insert_key_locked(fs_info, eb, i, +					      MOD_LOG_KEY_REMOVE_WHILE_FREEING); +		BUG_ON(ret < 0); +	} +} +  static noinline int  tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,  			 struct extent_buffer *old_root, @@ -583,9 +647,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,  	struct tree_mod_elem *tm;  	int ret; +	if (tree_mod_dont_log(fs_info, NULL)) +		return 0; + +	__tree_mod_log_free_eb(fs_info, old_root); +  	ret = tree_mod_alloc(fs_info, flags, &tm); -	if (ret <= 0) -		return ret; +	if (ret < 0) +		goto out;  	tm->index = new_root->start >> PAGE_CACHE_SHIFT;  	tm->old_root.logical = old_root->start; @@ -594,7 +663,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,  	tm->op = MOD_LOG_ROOT_REPLACE;  	ret = __tree_mod_log_insert(fs_info, tm); -	spin_unlock(&fs_info->tree_mod_seq_lock); +out: +	tree_mod_log_write_unlock(fs_info);  	return ret;  } @@ -608,7 +678,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,  	struct tree_mod_elem *found = NULL;  	u64 index = start >> PAGE_CACHE_SHIFT; -	read_lock(&fs_info->tree_mod_log_lock); +	tree_mod_log_read_lock(fs_info);  	tm_root = &fs_info->tree_mod_log;  	node = tm_root->rb_node;  	while (node) { @@ -617,18 +687,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,  			node = node->rb_left;  		} else if (cur->index > index) {  			node = node->rb_right; -		} else if (cur->elem.seq < min_seq) { +		} else if (cur->seq < min_seq) {  			node = node->rb_left;  		} else if (!smallest) {  			/* we want the node with the highest seq */  			if (found) -				BUG_ON(found->elem.seq > cur->elem.seq); +				BUG_ON(found->seq > cur->seq);  			found = cur;  			node = node->rb_left; -		} else if (cur->elem.seq > min_seq) { +		} else if (cur->seq > min_seq) {  			/* we want the node with the smallest seq */  			if (found) -				BUG_ON(found->elem.seq < cur->elem.seq); +				BUG_ON(found->seq < cur->seq);  			found = cur;  			node = node->rb_right;  		} else { @@ -636,7 +706,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,  			break;  		}  	} -	read_unlock(&fs_info->tree_mod_log_lock); +	tree_mod_log_read_unlock(fs_info);  	return found;  } @@ -664,7 +734,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)  	return __tree_mod_log_search(fs_info, start, min_seq, 0);  } -static inline void +static noinline void  tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,  		     struct extent_buffer *src, unsigned long dst_offset,  		     unsigned long src_offset, int nr_items) @@ -675,18 +745,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,  	if (tree_mod_dont_log(fs_info, NULL))  		return; -	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) +	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) { +		tree_mod_log_write_unlock(fs_info);  		return; +	} -	/* speed this up by single seq for all operations? */  	for (i = 0; i < nr_items; i++) { -		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, -					      MOD_LOG_KEY_REMOVE); +		ret = tree_mod_log_insert_key_locked(fs_info, src, +						     i + src_offset, +						     MOD_LOG_KEY_REMOVE);  		BUG_ON(ret < 0); -		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, -					      MOD_LOG_KEY_ADD); +		ret = tree_mod_log_insert_key_locked(fs_info, dst, +						     i + dst_offset, +						     MOD_LOG_KEY_ADD);  		BUG_ON(ret < 0);  	} + +	tree_mod_log_write_unlock(fs_info);  }  static inline void @@ -699,7 +774,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,  	BUG_ON(ret < 0);  } -static inline void +static noinline void  tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,  			  struct extent_buffer *eb,  			  struct btrfs_disk_key *disk_key, int slot, int atomic) @@ -712,30 +787,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,  	BUG_ON(ret < 0);  } -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, -				 struct extent_buffer *eb) +static noinline void +tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)  { -	int i; -	int ret; -	u32 nritems; -  	if (tree_mod_dont_log(fs_info, eb))  		return; -	nritems = btrfs_header_nritems(eb); -	for (i = nritems - 1; i >= 0; i--) { -		ret = tree_mod_log_insert_key(fs_info, eb, i, -					      MOD_LOG_KEY_REMOVE_WHILE_FREEING); -		BUG_ON(ret < 0); -	} +	__tree_mod_log_free_eb(fs_info, eb); + +	tree_mod_log_write_unlock(fs_info);  } -static inline void +static noinline void  tree_mod_log_set_root_pointer(struct btrfs_root *root,  			      struct extent_buffer *new_root_node)  {  	int ret; -	tree_mod_log_free_eb(root->fs_info, root->node);  	ret = tree_mod_log_insert_root(root->fs_info, root->node,  				       new_root_node, GFP_NOFS);  	BUG_ON(ret < 0); @@ -1024,11 +1091,18 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,  		if (!looped && !tm)  			return 0;  		/* -		 * we must have key remove operations in the log before the -		 * replace operation. +		 * if there are no tree operation for the oldest root, we simply +		 * return it. this should only happen if that (old) root is at +		 * level 0.  		 */ -		BUG_ON(!tm); +		if (!tm) +			break; +		/* +		 * if there's an operation that's not a root replacement, we +		 * found the oldest version of our root. normally, we'll find a +		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. +		 */  		if (tm->op != MOD_LOG_ROOT_REPLACE)  			break; @@ -1062,7 +1136,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,  	unsigned long p_size = sizeof(struct btrfs_key_ptr);  	n = btrfs_header_nritems(eb); -	while (tm && tm->elem.seq >= time_seq) { +	while (tm && tm->seq >= time_seq) {  		/*  		 * all the operations are recorded with the operator used for  		 * the modification. as we're going backwards, we do the @@ -1087,11 +1161,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,  						      tm->generation);  			break;  		case MOD_LOG_KEY_ADD: -			if (tm->slot != n - 1) { -				o_dst = btrfs_node_key_ptr_offset(tm->slot); -				o_src = btrfs_node_key_ptr_offset(tm->slot + 1); -				memmove_extent_buffer(eb, o_dst, o_src, p_size); -			} +			/* if a move operation is needed it's in the log */  			n--;  			break;  		case MOD_LOG_MOVE_KEYS: @@ -1192,16 +1262,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)  	}  	tm = tree_mod_log_search(root->fs_info, logical, time_seq); -	/* -	 * there was an item in the log when __tree_mod_log_oldest_root -	 * returned. this one must not go away, because the time_seq passed to -	 * us must be blocking its removal. -	 */ -	BUG_ON(!tm); -  	if (old_root) -		eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, -					       root->nodesize); +		eb = alloc_dummy_extent_buffer(logical, root->nodesize);  	else  		eb = btrfs_clone_extent_buffer(root->node);  	btrfs_tree_read_unlock(root->node); @@ -1216,7 +1278,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)  		btrfs_set_header_level(eb, old_root->level);  		btrfs_set_header_generation(eb, old_generation);  	} -	__tree_mod_log_rewind(eb, time_seq, tm); +	if (tm) +		__tree_mod_log_rewind(eb, time_seq, tm); +	else +		WARN_ON(btrfs_header_level(eb) != 0);  	extent_buffer_get(eb);  	return eb; @@ -2724,6 +2789,80 @@ done:  }  /* + * helper to use instead of search slot if no exact match is needed but + * instead the next or previous item should be returned. + * When find_higher is true, the next higher item is returned, the next lower + * otherwise. + * When return_any and find_higher are both true, and no higher item is found, + * return the next lower instead. + * When return_any is true and find_higher is false, and no lower item is found, + * return the next higher instead. + * It returns 0 if any item is found, 1 if none is found (tree empty), and + * < 0 on error + */ +int btrfs_search_slot_for_read(struct btrfs_root *root, +			       struct btrfs_key *key, struct btrfs_path *p, +			       int find_higher, int return_any) +{ +	int ret; +	struct extent_buffer *leaf; + +again: +	ret = btrfs_search_slot(NULL, root, key, p, 0, 0); +	if (ret <= 0) +		return ret; +	/* +	 * a return value of 1 means the path is at the position where the +	 * item should be inserted. Normally this is the next bigger item, +	 * but in case the previous item is the last in a leaf, path points +	 * to the first free slot in the previous leaf, i.e. at an invalid +	 * item. +	 */ +	leaf = p->nodes[0]; + +	if (find_higher) { +		if (p->slots[0] >= btrfs_header_nritems(leaf)) { +			ret = btrfs_next_leaf(root, p); +			if (ret <= 0) +				return ret; +			if (!return_any) +				return 1; +			/* +			 * no higher item found, return the next +			 * lower instead +			 */ +			return_any = 0; +			find_higher = 0; +			btrfs_release_path(p); +			goto again; +		} +	} else { +		if (p->slots[0] == 0) { +			ret = btrfs_prev_leaf(root, p); +			if (ret < 0) +				return ret; +			if (!ret) { +				p->slots[0] = btrfs_header_nritems(leaf) - 1; +				return 0; +			} +			if (!return_any) +				return 1; +			/* +			 * no lower item found, return the next +			 * higher instead +			 */ +			return_any = 0; +			find_higher = 1; +			btrfs_release_path(p); +			goto again; +		} else { +			--p->slots[0]; +		} +	} +	return 0; +} + +/*   * adjust the pointers going up the tree, starting at level   * making sure the right key of each node is points to 'key'.   * This is used after shifting pointers to the left, so it stops @@ -2995,7 +3134,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  static void insert_ptr(struct btrfs_trans_handle *trans,  		       struct btrfs_root *root, struct btrfs_path *path,  		       struct btrfs_disk_key *key, u64 bytenr, -		       int slot, int level, int tree_mod_log) +		       int slot, int level)  {  	struct extent_buffer *lower;  	int nritems; @@ -3008,7 +3147,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,  	BUG_ON(slot > nritems);  	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));  	if (slot != nritems) { -		if (tree_mod_log && level) +		if (level)  			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,  					     slot, nritems - slot);  		memmove_extent_buffer(lower, @@ -3016,7 +3155,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,  			      btrfs_node_key_ptr_offset(slot),  			      (nritems - slot) * sizeof(struct btrfs_key_ptr));  	} -	if (tree_mod_log && level) { +	if (level) {  		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,  					      MOD_LOG_KEY_ADD);  		BUG_ON(ret < 0); @@ -3104,7 +3243,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(split);  	insert_ptr(trans, root, path, &disk_key, split->start, -		   path->slots[level + 1] + 1, level + 1, 1); +		   path->slots[level + 1] + 1, level + 1);  	if (path->slots[level] >= mid) {  		path->slots[level] -= mid; @@ -3641,7 +3780,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,  	btrfs_set_header_nritems(l, mid);  	btrfs_item_key(right, &disk_key, 0);  	insert_ptr(trans, root, path, &disk_key, right->start, -		   path->slots[1] + 1, 1, 0); +		   path->slots[1] + 1, 1);  	btrfs_mark_buffer_dirty(right);  	btrfs_mark_buffer_dirty(l); @@ -3848,7 +3987,7 @@ again:  		if (mid <= slot) {  			btrfs_set_header_nritems(right, 0);  			insert_ptr(trans, root, path, &disk_key, right->start, -				   path->slots[1] + 1, 1, 0); +				   path->slots[1] + 1, 1);  			btrfs_tree_unlock(path->nodes[0]);  			free_extent_buffer(path->nodes[0]);  			path->nodes[0] = right; @@ -3857,7 +3996,7 @@ again:  		} else {  			btrfs_set_header_nritems(right, 0);  			insert_ptr(trans, root, path, &disk_key, right->start, -					  path->slots[1], 1, 0); +					  path->slots[1], 1);  			btrfs_tree_unlock(path->nodes[0]);  			free_extent_buffer(path->nodes[0]);  			path->nodes[0] = right; @@ -4933,6 +5072,431 @@ out:  	return ret;  } +static void tree_move_down(struct btrfs_root *root, +			   struct btrfs_path *path, +			   int *level, int root_level) +{ +	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], +					path->slots[*level]); +	path->slots[*level - 1] = 0; +	(*level)--; +} + +static int tree_move_next_or_upnext(struct btrfs_root *root, +				    struct btrfs_path *path, +				    int *level, int root_level) +{ +	int ret = 0; +	int nritems; +	nritems = btrfs_header_nritems(path->nodes[*level]); + +	path->slots[*level]++; + +	while (path->slots[*level] == nritems) { +		if (*level == root_level) +			return -1; + +		/* move upnext */ +		path->slots[*level] = 0; +		free_extent_buffer(path->nodes[*level]); +		path->nodes[*level] = NULL; +		(*level)++; +		path->slots[*level]++; + +		nritems = btrfs_header_nritems(path->nodes[*level]); +		ret = 1; +	} +	return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. + */ +static int tree_advance(struct btrfs_root *root, +			struct btrfs_path *path, +			int *level, int root_level, +			int allow_down, +			struct btrfs_key *key) +{ +	int ret; + +	if (*level == 0 || !allow_down) { +		ret = tree_move_next_or_upnext(root, path, level, root_level); +	} else { +		tree_move_down(root, path, level, root_level); +		ret = 0; +	} +	if (ret >= 0) { +		if (*level == 0) +			btrfs_item_key_to_cpu(path->nodes[*level], key, +					path->slots[*level]); +		else +			btrfs_node_key_to_cpu(path->nodes[*level], key, +					path->slots[*level]); +	} +	return ret; +} + +static int tree_compare_item(struct btrfs_root *left_root, +			     struct btrfs_path *left_path, +			     struct btrfs_path *right_path, +			     char *tmp_buf) +{ +	int cmp; +	int len1, len2; +	unsigned long off1, off2; + +	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); +	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); +	if (len1 != len2) +		return 1; + +	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); +	off2 = btrfs_item_ptr_offset(right_path->nodes[0], +				right_path->slots[0]); + +	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + +	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); +	if (cmp) +		return 1; +	return 0; +} + +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. + */ +int btrfs_compare_trees(struct btrfs_root *left_root, +			struct btrfs_root *right_root, +			btrfs_changed_cb_t changed_cb, void *ctx) +{ +	int ret; +	int cmp; +	struct btrfs_trans_handle *trans = NULL; +	struct btrfs_path *left_path = NULL; +	struct btrfs_path *right_path = NULL; +	struct btrfs_key left_key; +	struct btrfs_key right_key; +	char *tmp_buf = NULL; +	int left_root_level; +	int right_root_level; +	int left_level; +	int right_level; +	int left_end_reached; +	int right_end_reached; +	int advance_left; +	int advance_right; +	u64 left_blockptr; +	u64 right_blockptr; +	u64 left_start_ctransid; +	u64 right_start_ctransid; +	u64 ctransid; + +	left_path = btrfs_alloc_path(); +	if (!left_path) { +		ret = -ENOMEM; +		goto out; +	} +	right_path = btrfs_alloc_path(); +	if (!right_path) { +		ret = -ENOMEM; +		goto out; +	} + +	tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); +	if (!tmp_buf) { +		ret = -ENOMEM; +		goto out; +	} + +	left_path->search_commit_root = 1; +	left_path->skip_locking = 1; +	right_path->search_commit_root = 1; +	right_path->skip_locking = 1; + +	spin_lock(&left_root->root_times_lock); +	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); +	spin_unlock(&left_root->root_times_lock); + +	spin_lock(&right_root->root_times_lock); +	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); +	spin_unlock(&right_root->root_times_lock); + +	trans = btrfs_join_transaction(left_root); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		trans = NULL; +		goto out; +	} + +	/* +	 * Strategy: Go to the first items of both trees. Then do +	 * +	 * If both trees are at level 0 +	 *   Compare keys of current items +	 *     If left < right treat left item as new, advance left tree +	 *       and repeat +	 *     If left > right treat right item as deleted, advance right tree +	 *       and repeat +	 *     If left == right do deep compare of items, treat as changed if +	 *       needed, advance both trees and repeat +	 * If both trees are at the same level but not at level 0 +	 *   Compare keys of current nodes/leafs +	 *     If left < right advance left tree and repeat +	 *     If left > right advance right tree and repeat +	 *     If left == right compare blockptrs of the next nodes/leafs +	 *       If they match advance both trees but stay at the same level +	 *         and repeat +	 *       If they don't match advance both trees while allowing to go +	 *         deeper and repeat +	 * If tree levels are different +	 *   Advance the tree that needs it and repeat +	 * +	 * Advancing a tree means: +	 *   If we are at level 0, try to go to the next slot. If that's not +	 *   possible, go one level up and repeat. Stop when we found a level +	 *   where we could go to the next slot. We may at this point be on a +	 *   node or a leaf. +	 * +	 *   If we are not at level 0 and not on shared tree blocks, go one +	 *   level deeper. +	 * +	 *   If we are not at level 0 and on shared tree blocks, go one slot to +	 *   the right if possible or go up and right. +	 */ + +	left_level = btrfs_header_level(left_root->commit_root); +	left_root_level = left_level; +	left_path->nodes[left_level] = left_root->commit_root; +	extent_buffer_get(left_path->nodes[left_level]); + +	right_level = btrfs_header_level(right_root->commit_root); +	right_root_level = right_level; +	right_path->nodes[right_level] = right_root->commit_root; +	extent_buffer_get(right_path->nodes[right_level]); + +	if (left_level == 0) +		btrfs_item_key_to_cpu(left_path->nodes[left_level], +				&left_key, left_path->slots[left_level]); +	else +		btrfs_node_key_to_cpu(left_path->nodes[left_level], +				&left_key, left_path->slots[left_level]); +	if (right_level == 0) +		btrfs_item_key_to_cpu(right_path->nodes[right_level], +				&right_key, right_path->slots[right_level]); +	else +		btrfs_node_key_to_cpu(right_path->nodes[right_level], +				&right_key, right_path->slots[right_level]); + +	left_end_reached = right_end_reached = 0; +	advance_left = advance_right = 0; + +	while (1) { +		/* +		 * We need to make sure the transaction does not get committed +		 * while we do anything on commit roots. This means, we need to +		 * join and leave transactions for every item that we process. +		 */ +		if (trans && btrfs_should_end_transaction(trans, left_root)) { +			btrfs_release_path(left_path); +			btrfs_release_path(right_path); + +			ret = btrfs_end_transaction(trans, left_root); +			trans = NULL; +			if (ret < 0) +				goto out; +		} +		/* now rejoin the transaction */ +		if (!trans) { +			trans = btrfs_join_transaction(left_root); +			if (IS_ERR(trans)) { +				ret = PTR_ERR(trans); +				trans = NULL; +				goto out; +			} + +			spin_lock(&left_root->root_times_lock); +			ctransid = btrfs_root_ctransid(&left_root->root_item); +			spin_unlock(&left_root->root_times_lock); +			if (ctransid != left_start_ctransid) +				left_start_ctransid = 0; + +			spin_lock(&right_root->root_times_lock); +			ctransid = btrfs_root_ctransid(&right_root->root_item); +			spin_unlock(&right_root->root_times_lock); +			if (ctransid != right_start_ctransid) +				right_start_ctransid = 0; + +			if (!left_start_ctransid || !right_start_ctransid) { +				WARN(1, KERN_WARNING +					"btrfs: btrfs_compare_tree detected " +					"a change in one of the trees while " +					"iterating. This is probably a " +					"bug.\n"); +				ret = -EIO; +				goto out; +			} + +			/* +			 * the commit root may have changed, so start again +			 * where we stopped +			 */ +			left_path->lowest_level = left_level; +			right_path->lowest_level = right_level; +			ret = btrfs_search_slot(NULL, left_root, +					&left_key, left_path, 0, 0); +			if (ret < 0) +				goto out; +			ret = btrfs_search_slot(NULL, right_root, +					&right_key, right_path, 0, 0); +			if (ret < 0) +				goto out; +		} + +		if (advance_left && !left_end_reached) { +			ret = tree_advance(left_root, left_path, &left_level, +					left_root_level, +					advance_left != ADVANCE_ONLY_NEXT, +					&left_key); +			if (ret < 0) +				left_end_reached = ADVANCE; +			advance_left = 0; +		} +		if (advance_right && !right_end_reached) { +			ret = tree_advance(right_root, right_path, &right_level, +					right_root_level, +					advance_right != ADVANCE_ONLY_NEXT, +					&right_key); +			if (ret < 0) +				right_end_reached = ADVANCE; +			advance_right = 0; +		} + +		if (left_end_reached && right_end_reached) { +			ret = 0; +			goto out; +		} else if (left_end_reached) { +			if (right_level == 0) { +				ret = changed_cb(left_root, right_root, +						left_path, right_path, +						&right_key, +						BTRFS_COMPARE_TREE_DELETED, +						ctx); +				if (ret < 0) +					goto out; +			} +			advance_right = ADVANCE; +			continue; +		} else if (right_end_reached) { +			if (left_level == 0) { +				ret = changed_cb(left_root, right_root, +						left_path, right_path, +						&left_key, +						BTRFS_COMPARE_TREE_NEW, +						ctx); +				if (ret < 0) +					goto out; +			} +			advance_left = ADVANCE; +			continue; +		} + +		if (left_level == 0 && right_level == 0) { +			cmp = btrfs_comp_cpu_keys(&left_key, &right_key); +			if (cmp < 0) { +				ret = changed_cb(left_root, right_root, +						left_path, right_path, +						&left_key, +						BTRFS_COMPARE_TREE_NEW, +						ctx); +				if (ret < 0) +					goto out; +				advance_left = ADVANCE; +			} else if (cmp > 0) { +				ret = changed_cb(left_root, right_root, +						left_path, right_path, +						&right_key, +						BTRFS_COMPARE_TREE_DELETED, +						ctx); +				if (ret < 0) +					goto out; +				advance_right = ADVANCE; +			} else { +				ret = tree_compare_item(left_root, left_path, +						right_path, tmp_buf); +				if (ret) { +					ret = changed_cb(left_root, right_root, +						left_path, right_path, +						&left_key, +						BTRFS_COMPARE_TREE_CHANGED, +						ctx); +					if (ret < 0) +						goto out; +				} +				advance_left = ADVANCE; +				advance_right = ADVANCE; +			} +		} else if (left_level == right_level) { +			cmp = btrfs_comp_cpu_keys(&left_key, &right_key); +			if (cmp < 0) { +				advance_left = ADVANCE; +			} else if (cmp > 0) { +				advance_right = ADVANCE; +			} else { +				left_blockptr = btrfs_node_blockptr( +						left_path->nodes[left_level], +						left_path->slots[left_level]); +				right_blockptr = btrfs_node_blockptr( +						right_path->nodes[right_level], +						right_path->slots[right_level]); +				if (left_blockptr == right_blockptr) { +					/* +					 * As we're on a shared block, don't +					 * allow to go deeper. +					 */ +					advance_left = ADVANCE_ONLY_NEXT; +					advance_right = ADVANCE_ONLY_NEXT; +				} else { +					advance_left = ADVANCE; +					advance_right = ADVANCE; +				} +			} +		} else if (left_level < right_level) { +			advance_right = ADVANCE; +		} else { +			advance_left = ADVANCE; +		} +	} + +out: +	btrfs_free_path(left_path); +	btrfs_free_path(right_path); +	kfree(tmp_buf); + +	if (trans) { +		if (!ret) +			ret = btrfs_end_transaction(trans, left_root); +		else +			btrfs_end_transaction(trans, left_root); +	} + +	return ret; +} +  /*   * this is similar to btrfs_next_leaf, but does not try to preserve   * and fixup the path.  It looks for and returns the next key in the @@ -5121,6 +5685,19 @@ again:  		if (!path->skip_locking) {  			ret = btrfs_try_tree_read_lock(next); +			if (!ret && time_seq) { +				/* +				 * If we don't get the lock, we may be racing +				 * with push_leaf_left, holding that lock while +				 * itself waiting for the leaf we've currently +				 * locked. To solve this situation, we give up +				 * on our lock and cycle. +				 */ +				free_extent_buffer(next); +				btrfs_release_path(path); +				cond_resched(); +				goto again; +			}  			if (!ret) {  				btrfs_set_path_blocking(path);  				btrfs_tree_read_lock(next);  | 
