summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c2
-rw-r--r--fs/btrfs/backref.c16
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/check-integrity.c31
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/ctree.c241
-rw-r--r--fs/btrfs/ctree.h184
-rw-r--r--fs/btrfs/delayed-inode.c11
-rw-r--r--fs/btrfs/dev-replace.c856
-rw-r--r--fs/btrfs/dev-replace.h44
-rw-r--r--fs/btrfs/dir-item.c59
-rw-r--r--fs/btrfs/disk-io.c142
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c229
-rw-r--r--fs/btrfs/extent_io.c37
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c27
-rw-r--r--fs/btrfs/extent_map.h2
-rw-r--r--fs/btrfs/file-item.c21
-rw-r--r--fs/btrfs/file.c422
-rw-r--r--fs/btrfs/free-space-cache.c51
-rw-r--r--fs/btrfs/inode-map.c5
-rw-r--r--fs/btrfs/inode.c484
-rw-r--r--fs/btrfs/ioctl.c317
-rw-r--r--fs/btrfs/ioctl.h48
-rw-r--r--fs/btrfs/math.h44
-rw-r--r--fs/btrfs/ordered-data.c90
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/reada.c31
-rw-r--r--fs/btrfs/relocation.c40
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/scrub.c1836
-rw-r--r--fs/btrfs/send.c8
-rw-r--r--fs/btrfs/super.c48
-rw-r--r--fs/btrfs/transaction.c170
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-log.c477
-rw-r--r--fs/btrfs/volumes.c966
-rw-r--r--fs/btrfs/volumes.h35
-rw-r--r--fs/btrfs/xattr.c13
42 files changed, 5267 insertions, 1758 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o qgroup.o send.o
+ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
return ret;
+ if (ret == 0)
+ acl = NULL;
}
ret = 0;
break;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
pos2 = n2, n2 = pos2->next) {
struct __prelim_ref *ref2;
struct __prelim_ref *xchg;
+ struct extent_inode_elem *eie;
ref2 = list_entry(pos2, struct __prelim_ref, list);
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
ref1 = ref2;
ref2 = xchg;
}
- ref1->count += ref2->count;
} else {
if (ref1->parent != ref2->parent)
continue;
- ref1->count += ref2->count;
}
+
+ eie = ref1->inode_list;
+ while (eie && eie->next)
+ eie = eie->next;
+ if (eie)
+ eie->next = ref2->inode_list;
+ else
+ ref1->inode_list = ref2->inode_list;
+ ref1->count += ref2->count;
+
list_del(&ref2->list);
kfree(ref2);
}
@@ -890,8 +899,7 @@ again:
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
list_del(&ref->list);
- if (ref->count < 0)
- WARN_ON(1);
+ WARN_ON(ref->count < 0);
if (ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
#define BTRFS_INODE_NEEDS_FULL_SYNC 7
+#define BTRFS_INODE_COPY_EVERYTHING 8
/* in memory btrfs inode */
struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
unsigned long runtime_flags;
+ /* Keep track of who's O_SYNC/fsycing currently */
+ atomic_t sync_writers;
+
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
unsigned int never_written:1; /* block was added because it was
* referenced, not because it was
* written */
- unsigned int mirror_num:2; /* large enough to hold
+ unsigned int mirror_num; /* large enough to hold
* BTRFS_SUPER_MIRROR_MAX */
struct btrfsic_dev_state *dev_state;
u64 dev_bytenr; /* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
*next_blockp = NULL;
if (0 == *num_copiesp) {
*num_copiesp =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
chunk_len = num_bytes;
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->datablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_device *device;
length = len;
- ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+ ret = btrfs_map_block(state->root->fs_info, READ,
bytenr, &length, &multi, mirror_num);
+ if (ret) {
+ block_ctx_out->start = 0;
+ block_ctx_out->dev_bytenr = 0;
+ block_ctx_out->len = 0;
+ block_ctx_out->dev = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ return ret;
+ }
+
device = multi->stripes[0].dev;
block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
block_ctx_out->pagev = NULL;
block_ctx_out->mem_to_free = NULL;
- if (0 == ret)
- kfree(multi);
+ kfree(multi);
if (NULL == block_ctx_out->dev) {
ret = -ENXIO;
printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, BTRFS_SUPER_INFO_SIZE);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
struct btrfsic_block_data_ctx block_ctx;
int match = 0;
- num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ num_copies = btrfs_num_copies(state->root->fs_info,
bytenr, state->metablock_size);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ bio_endio(comp_bio, ret);
bio_put(comp_bio);
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ bio_endio(comp_bio, ret);
bio_put(comp_bio);
return 0;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806..c7b67cf24bba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot,
- int tree_mod_log);
+ struct btrfs_path *path, int level, int slot);
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb);
struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int slot, int atomic)
+ struct extent_buffer *eb, int slot, int atomic)
{
int ret;
@@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
switch (tm->op) {
case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
- case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case MOD_LOG_KEY_REMOVE:
+ n++;
+ case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
btrfs_set_node_key(eb, &tm->key, tm->slot);
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
- n++;
break;
case MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
u64 search_start;
int ret;
- if (trans->transaction != root->fs_info->running_transaction) {
- printk(KERN_CRIT "trans %llu running %llu\n",
+ if (trans->transaction != root->fs_info->running_transaction)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
(unsigned long long)trans->transid,
(unsigned long long)
root->fs_info->running_transaction->transid);
- WARN_ON(1);
- }
- if (trans->transid != root->fs_info->generation) {
- printk(KERN_CRIT "trans %llu running %llu\n",
+
+ if (trans->transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
(unsigned long long)trans->transid,
(unsigned long long)root->fs_info->generation);
- WARN_ON(1);
- }
if (!should_cow_block(trans, root, buf)) {
*cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (cache_only && parent_level != 1)
return 0;
- if (trans->transaction != root->fs_info->running_transaction)
- WARN_ON(1);
- if (trans->transid != root->fs_info->generation)
- WARN_ON(1);
+ WARN_ON(trans->transaction != root->fs_info->running_transaction);
+ WARN_ON(trans->transid != root->fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- del_ptr(trans, root, path, level + 1, pslot + 1, 1);
+ del_ptr(trans, root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, root, right, 0, 1);
free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &right_key, pslot + 1, 0);
+ pslot + 1, 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- del_ptr(trans, root, path, level + 1, pslot, 1);
+ del_ptr(trans, root, path, level + 1, pslot);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+ tree_mod_log_set_node_key(root->fs_info, parent,
pslot, 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &disk_key, pslot, 0);
+ pslot, 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &disk_key, pslot + 1, 0);
+ pslot + 1, 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
int no_skips = 0;
struct extent_buffer *t;
+ if (path->really_keep_locks)
+ return;
+
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
int i;
- if (path->keep_locks)
+ if (path->keep_locks || path->really_keep_locks)
return;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (!cow)
write_lock_level = -1;
- if (cow && (p->keep_locks || p->lowest_level))
+ if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
write_lock_level = BTRFS_MAX_LEVEL;
min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
* must have write locks on this node and the
* parent
*/
- if (level + 1 > write_lock_level) {
+ if (level > write_lock_level ||
+ (level + 1 > write_lock_level &&
+ level + 1 < BTRFS_MAX_LEVEL &&
+ p->nodes[level + 1])) {
write_lock_level = level + 1;
btrfs_release_path(p);
goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
if (!path->nodes[i])
break;
t = path->nodes[i];
- tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
+ tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
+ struct btrfs_item *start_item;
+ struct btrfs_item *end_item;
+ struct btrfs_map_token token;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- data_len = btrfs_item_end_nr(l, start);
- data_len = data_len - btrfs_item_offset_nr(l, end);
+ btrfs_init_map_token(&token);
+ start_item = btrfs_item_nr(l, start);
+ end_item = btrfs_item_nr(l, end);
+ data_len = btrfs_token_item_offset(l, start_item, &token) +
+ btrfs_token_item_size(l, start_item, &token);
+ data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (push_items == 0)
goto out_unlock;
- if (!empty && push_items == left_nritems)
- WARN_ON(1);
+ WARN_ON(!empty && push_items == left_nritems);
/* push left to right */
right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(left, old_left_nritems + push_items);
/* fixup right node */
- if (push_items > right_nritems) {
- printk(KERN_CRIT "push items %d nr %u\n", push_items,
+ if (push_items > right_nritems)
+ WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
right_nritems);
- WARN_ON(1);
- }
if (push_items < right_nritems) {
push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* empty a node.
*/
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot,
- int tree_mod_log)
+ struct btrfs_path *path, int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
int ret;
+ if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
+ }
+
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
- if (tree_mod_log && level)
+ if (level)
tree_mod_log_eb_move(root->fs_info, parent, slot,
slot + 1, nritems - slot - 1);
memmove_extent_buffer(parent,
@@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
- } else if (tree_mod_log && level) {
- ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
- MOD_LOG_KEY_REMOVE);
- BUG_ON(ret < 0);
}
nritems--;
@@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf)
{
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- del_ptr(trans, root, path, 1, path->slots[1], 1);
+ del_ptr(trans, root, path, 1, path->slots[1]);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_path->search_commit_root = 1;
right_path->skip_locking = 1;
- spin_lock(&left_root->root_times_lock);
+ spin_lock(&left_root->root_item_lock);
left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_times_lock);
+ spin_unlock(&left_root->root_item_lock);
- spin_lock(&right_root->root_times_lock);
+ spin_lock(&right_root->root_item_lock);
right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_times_lock);
+ spin_unlock(&right_root->root_item_lock);
trans = btrfs_join_transaction(left_root);
if (IS_ERR(trans)) {
@@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- spin_lock(&left_root->root_times_lock);
+ spin_lock(&left_root->root_item_lock);
ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_times_lock);
+ spin_unlock(&left_root->root_item_lock);
if (ctransid != left_start_ctransid)
left_start_ctransid = 0;
- spin_lock(&right_root->root_times_lock);
+ spin_lock(&right_root->root_item_lock);
ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_times_lock);
+ spin_unlock(&right_root->root_item_lock);
if (ctransid != right_start_ctransid)
right_start_ctransid = 0;
@@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
return btrfs_next_old_leaf(root, path, 0);
}
+/* Release the path up to but not including the given level */
+static void btrfs_release_level(struct btrfs_path *path, int level)
+{
+ int i;
+
+ for (i = 0; i < level; i++) {
+ path->slots[i] = 0;
+ if (!path->nodes[i])
+ continue;
+ if (path->locks[i]) {
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ path->locks[i] = 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+}
+
+/*
+ * This function assumes 2 things
+ *
+ * 1) You are using path->keep_locks
+ * 2) You are not inserting items.
+ *
+ * If either of these are not true do not use this function. If you need a next
+ * leaf with either of these not being true then this function can be easily
+ * adapted to do that, but at the moment these are the limitations.
+ */
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int del)
+{
+ struct extent_buffer *b;
+ struct btrfs_key key;
+ u32 nritems;
+ int level = 1;
+ int slot;
+ int ret = 1;
+ int write_lock_level = BTRFS_MAX_LEVEL;
+ int ins_len = del ? -1 : 0;
+
+ WARN_ON(!(path->keep_locks || path->really_keep_locks));
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+ while (path->nodes[level]) {
+ nritems = btrfs_header_nritems(path->nodes[level]);
+ if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
+search:
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path,
+ ins_len, 1);
+ if (ret < 0)
+ goto out;
+ level = 1;
+ continue;
+ }
+
+ if (path->slots[level] >= nritems - 1) {
+ level++;
+ continue;
+ }
+
+ btrfs_release_level(path, level);
+ break;
+ }
+
+ if (!path->nodes[level]) {
+ ret = 1;
+ goto out;
+ }
+
+ path->slots[level]++;
+ b = path->nodes[level];
+
+ while (b) {
+ level = btrfs_header_level(b);
+
+ if (!should_cow_block(trans, root, b))
+ goto cow_done;
+
+ btrfs_set_path_blocking(path);
+ ret = btrfs_cow_block(trans, root, b,
+ path->nodes[level + 1],
+ path->slots[level + 1], &b);
+ if (ret)
+ goto out;
+cow_done:
+ path->nodes[level] = b;
+ btrfs_clear_path_blocking(path, NULL, 0);
+ if (level != 0) {
+ ret = setup_nodes_for_search(trans, root, path, b,
+ level, ins_len,
+ &write_lock_level);
+ if (ret == -EAGAIN)
+ goto search;
+ if (ret)
+ goto out;
+
+ b = path->nodes[level];
+ slot = path->slots[level];
+
+ ret = read_block_for_search(trans, root, path,
+ &b, level, slot, &key, 0);
+ if (ret == -EAGAIN)
+ goto search;
+ if (ret)
+ goto out;
+ level = btrfs_header_level(b);
+ if (!btrfs_try_tree_write_lock(b)) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_lock(b);
+ btrfs_clear_path_blocking(path, b,
+ BTRFS_WRITE_LOCK);
+ }
+ path->locks[level] = BTRFS_WRITE_LOCK;
+ path->nodes[level] = b;
+ path->slots[level] = 0;
+ } else {
+ path->slots[level] = 0;
+ ret = 0;
+ break;
+ }
+ }
+
+out:
+ if (ret)
+ btrfs_release_path(path);
+
+ return ret;
+}
+
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead869507..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
#define BTRFS_MAGIC "_BHRfS_M"
-#define BTRFS_MAX_MIRRORS 2
+#define BTRFS_MAX_MIRRORS 3
#define BTRFS_MAX_LEVEL 8
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+#define BTRFS_DEV_REPLACE_DEVID 0
+
/*
* the max metadata block size. This limit is somewhat artificial,
* but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
+/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS (1 << 30)
+
#define BTRFS_FT_UNKNOWN 0
#define BTRFS_FT_REG_FILE 1
#define BTRFS_FT_DIR 2
@@ -413,7 +418,7 @@ struct btrfs_root_backup {
__le64 bytes_used;
__le64 num_devices;
/* future */
- __le64 unsed_64[4];
+ __le64 unused_64[4];
u8 tree_root_level;
u8 chunk_root_level;
@@ -571,6 +576,7 @@ struct btrfs_path {
unsigned int skip_locking:1;
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
+ unsigned int really_keep_locks:1;
};
/*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
} __attribute__ ((__packed__));
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
+
+struct btrfs_dev_replace {
+ u64 replace_state; /* see #define above */
+ u64 time_started; /* seconds since 1-Jan-1970 */
+ u64 time_stopped; /* seconds since 1-Jan-1970 */
+ atomic64_t num_write_errors;
+ atomic64_t num_uncorrectable_read_errors;
+
+ u64 cursor_left;
+ u64 committed_cursor_left;
+ u64 cursor_left_last_write_of_item;
+ u64 cursor_right;
+
+ u64 cont_reading_from_srcdev_mode; /* see #define above */
+
+ int is_valid;
+ int item_needs_writeback;
+ struct btrfs_device *srcdev;
+ struct btrfs_device *tgtdev;
+
+ pid_t lock_owner;
+ atomic_t nesting_level;
+ struct mutex lock_finishing_cancel_unmount;
+ struct mutex lock_management_lock;
+ struct mutex lock;
+
+ struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 src_devid;
+ __le64 cursor_left;
+ __le64 cursor_right;
+ __le64 cont_reading_from_srcdev_mode;
+
+ __le64 replace_state;
+ __le64 time_started;
+ __le64 time_stopped;
+ __le64 num_write_errors;
+ __le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
struct btrfs_workers generic_worker;
struct btrfs_workers workers;
struct btrfs_workers delalloc_workers;
+ struct btrfs_workers flush_workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_meta_workers;
struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
struct rw_semaphore scrub_super_lock;
int scrub_workers_refcnt;
struct btrfs_workers scrub_workers;
+ struct btrfs_workers scrub_wr_completion_workers;
+ struct btrfs_workers scrub_nocow_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
int backup_root_index;
int num_tolerated_disk_barrier_failures;
+
+ /* device replace state */
+ struct btrfs_dev_replace dev_replace;
+
+ atomic_t mutually_exclusive_operation_running;
};
/*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
int force_cow;
- spinlock_t root_times_lock;
+ spinlock_t root_item_lock;
};
struct btrfs_ioctl_defrag_range_args {
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_STATS_KEY 249
/*
+ * Persistantly stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY 250
+
+/*
* string items are for debugging. They just store a short string of
* data in the FS
*/
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
static inline void btrfs_init_map_token (struct btrfs_map_token *token)
{
- memset(token, 0, sizeof(*token));
+ token->kaddr = NULL;
}
/* some macros to generate set/get funcs for the struct fields. This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
rsv_excl, 64);
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+ replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+ time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+ time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+ num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+ cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+ cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item,
+ cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+ struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+ struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+ struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+ struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item,
+ num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+ struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ struct btrfs_dev_replace_item, cursor_right, 64);
+
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+ /* If we are in the transaction, we can't flush anything.*/
+ BTRFS_RESERVE_NO_FLUSH,
+ /*
+ * Flushing delalloc may cause deadlock somewhere, in this
+ * case, use FLUSH LIMIT
+ */
+ BTRFS_RESERVE_FLUSH_LIMIT,
+ BTRFS_RESERVE_FLUSH_ALL,
+};
+
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int min_factor);
int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved);
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved);
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int del);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
/* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
u64 bytenr, int mod);
+u64 btrfs_file_extent_length(struct btrfs_path *path);
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
/* inode.c */
+struct btrfs_delalloc_work {
+ struct inode *inode;
+ int wait;
+ int delay_iput;
+ struct completion completion;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
size_t pg_offset, u64 start, u64 len,
int create);
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
/* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
/* scrub.c */
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
- struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace);
void btrfs_scrub_pause(struct btrfs_root *root);
void btrfs_scrub_pause_super(struct btrfs_root *root);
void btrfs_scrub_continue(struct btrfs_root *root);
void btrfs_scrub_continue_super(struct btrfs_root *root);
-int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel(struct btrfs_root *root);
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+ struct btrfs_device *dev);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
/*
* Since we're under a transaction reserve_metadata_bytes could
* try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
* reserve something strictly for us. If not be a pain and try
* to steal from the delalloc block rsv.
*/
- ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret)
goto out;
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
struct btrfs_delayed_node *delayed_node = NULL;
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
- unsigned long nr = 0;
int need_requeue = 0;
int ret;
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
delayed_node);
mutex_unlock(&delayed_node->mutex);
- nr = trans->blocks_used;
-
trans->block_rsv = block_rsv;
btrfs_end_transaction_dmeta(trans, root);
- __btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty_nodelay(root);
free_path:
btrfs_free_path(path);
out:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+
+static u64 btrfs_get_seconds_since_1970(void);
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret);
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev);
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device);
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+static int btrfs_dev_replace_kthread(void *data);
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
+
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_path *path = NULL;
+ int item_size;
+ struct btrfs_dev_replace_item *ptr;
+ u64 src_devid;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+no_valid_dev_replace_entry_found:
+ ret = 0;
+ dev_replace->replace_state =
+ BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+ dev_replace->cont_reading_from_srcdev_mode =
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+ dev_replace->replace_state = 0;
+ dev_replace->time_started = 0;
+ dev_replace->time_stopped = 0;
+ atomic64_set(&dev_replace->num_write_errors, 0);
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ dev_replace->is_valid = 0;
+ dev_replace->item_needs_writeback = 0;
+ goto out;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+ if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+ pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
+ goto no_valid_dev_replace_entry_found;
+ }
+
+ src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+ dev_replace->cont_reading_from_srcdev_mode =
+ btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+ dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+ dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+ dev_replace->time_stopped =
+ btrfs_dev_replace_time_stopped(eb, ptr);
+ atomic64_set(&dev_replace->num_write_errors,
+ btrfs_dev_replace_num_write_errors(eb, ptr));
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+ btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+ dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+ dev_replace->committed_cursor_left = dev_replace->cursor_left;
+ dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+ dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+ dev_replace->is_valid = 1;
+
+ dev_replace->item_needs_writeback = 0;
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
+ NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info,
+ BTRFS_DEV_REPLACE_DEVID,
+ NULL, NULL);
+ /*
+ * allow 'btrfs dev replace_cancel' if src/tgt device is
+ * missing
+ */
+ if (!dev_replace->srcdev &&
+ !btrfs_test_opt(dev_root, DEGRADED)) {
+ ret = -EIO;
+ pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
+ (unsigned long long)src_devid);
+ }
+ if (!dev_replace->tgtdev &&
+ !btrfs_test_opt(dev_root, DEGRADED)) {
+ ret = -EIO;
+ pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
+ (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
+ }
+ if (dev_replace->tgtdev) {
+ if (dev_replace->srcdev) {
+ dev_replace->tgtdev->total_bytes =
+ dev_replace->srcdev->total_bytes;
+ dev_replace->tgtdev->disk_total_bytes =
+ dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->bytes_used =
+ dev_replace->srcdev->bytes_used;
+ }
+ dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+ btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
+ dev_replace->tgtdev);
+ }
+ break;
+ }
+
+out:
+ if (path)
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes changed device replace state to
+ * disk.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_replace_item *ptr;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (!dev_replace->is_valid ||
+ !dev_replace->item_needs_writeback) {
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ pr_warn("btrfs: error %d while searching for dev_replace item!\n",
+ ret);
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /*
+ * need to delete old one and insert a new one.
+ * Since no attempt is made to recover any old state, if the
+ * dev_replace state is 'running', the data on the target
+ * drive is lost.
+ * It would be possible to recover the state: just make sure
+ * that the beginning of the item is never changed and always
+ * contains all the essential information. Then read this
+ * minimal set of information and use it as a base for the
+ * new state.
+ */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
+ ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ pr_warn("btrfs: insert dev_replace item failed %d!\n",
+ ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_dev_replace_item);
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (dev_replace->srcdev)
+ btrfs_set_dev_replace_src_devid(eb, ptr,
+ dev_replace->srcdev->devid);
+ else
+ btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+ btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+ dev_replace->cont_reading_from_srcdev_mode);
+ btrfs_set_dev_replace_replace_state(eb, ptr,
+ dev_replace->replace_state);
+ btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+ btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+ btrfs_set_dev_replace_num_write_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_write_errors));
+ btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+ dev_replace->cursor_left_last_write_of_item =
+ dev_replace->cursor_left;
+ btrfs_set_dev_replace_cursor_left(eb, ptr,
+ dev_replace->cursor_left_last_write_of_item);
+ btrfs_set_dev_replace_cursor_right(eb, ptr,
+ dev_replace->cursor_right);
+ dev_replace->item_needs_writeback = 0;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ dev_replace->committed_cursor_left =
+ dev_replace->cursor_left_last_write_of_item;
+}
+
+static u64 btrfs_get_seconds_since_1970(void)
+{
+ struct timespec t = CURRENT_TIME_SEC;
+
+ return t.tv_sec;
+}
+
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_device *src_device = NULL;
+
+ switch (args->start.cont_reading_from_srcdev_mode) {
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+ args->start.tgtdev_name[0] == '\0')
+ return -EINVAL;
+
+ mutex_lock(&fs_info->volume_mutex);
+ ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ &tgt_device);
+ if (ret) {
+ pr_err("btrfs: target device %s is invalid!\n",
+ args->start.tgtdev_name);
+ mutex_unlock(&fs_info->volume_mutex);
+ return -EINVAL;
+ }
+
+ ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
+ args->start.srcdev_name,
+ &src_device);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (ret) {
+ ret = -EINVAL;
+ goto leave_no_lock;
+ }
+
+ if (tgt_device->total_bytes < src_device->total_bytes) {
+ pr_err("btrfs: target device is smaller than source device!\n");
+ ret = -EINVAL;
+ goto leave_no_lock;
+ }
+
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ goto leave;
+ }
+
+ dev_replace->cont_reading_from_srcdev_mode =
+ args->start.cont_reading_from_srcdev_mode;
+ WARN_ON(!src_device);
+ dev_replace->srcdev = src_device;
+ WARN_ON(!tgt_device);
+ dev_replace->tgtdev = tgt_device;
+
+ printk_in_rcu(KERN_INFO
+ "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+
+ tgt_device->total_bytes = src_device->total_bytes;
+ tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+ tgt_device->bytes_used = src_device->bytes_used;
+
+ /*
+ * from now on, the writes to the srcdev are all duplicated to
+ * go to the tgtdev as well (refer to btrfs_map_block()).
+ */
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ dev_replace->time_started = btrfs_get_seconds_since_1970();
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->is_valid = 1;
+ dev_replace->item_needs_writeback = 1;
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_wait_ordered_extents(root, 0);
+
+ /* force writing the updated state information to disk */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_dev_replace_lock(dev_replace);
+ goto leave;
+ }
+
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ /* the disk copy procedure reuses the scrub code */
+ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+ src_device->total_bytes,
+ &dev_replace->scrub_progress, 0, 1);
+
+ ret = btrfs_dev_replace_finishing(root->fs_info, ret);
+ WARN_ON(ret);
+
+ return 0;
+
+leave:
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ btrfs_dev_replace_unlock(dev_replace);
+leave_no_lock:
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ return ret;
+}
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device;
+ struct btrfs_device *src_device;
+ struct btrfs_root *root = fs_info->tree_root;
+ u8 uuid_tmp[BTRFS_UUID_SIZE];
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ /* don't allow cancel or unmount to disturb the finishing procedure */
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+ btrfs_dev_replace_lock(dev_replace);
+ /* was the operation canceled, or is it finished? */
+ if (dev_replace->replace_state !=
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return 0;
+ }
+
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ /* replace old device with new one in mapping tree */
+ if (!scrub_ret)
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+
+ /*
+ * flush all outstanding I/O and inode extent mappings before the
+ * copy operation is declared as being finished
+ */
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ /* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ btrfs_dev_replace_lock(dev_replace);
+ dev_replace->replace_state =
+ scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+ : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->item_needs_writeback = 1;
+
+ if (scrub_ret) {
+ printk_in_rcu(KERN_ERR
+ "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name), scrub_ret);
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+ }
+
+ printk_in_rcu(KERN_INFO
+ "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+ tgt_device->is_tgtdev_for_dev_replace = 0;
+ tgt_device->devid = src_device->devid;
+ src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+ tgt_device->bytes_used = src_device->bytes_used;
+ memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
+ memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+ memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+ tgt_device->total_bytes = src_device->total_bytes;
+ tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+ tgt_device->bytes_used = src_device->bytes_used;
+ if (fs_info->sb->s_bdev == src_device->bdev)
+ fs_info->sb->s_bdev = tgt_device->bdev;
+ if (fs_info->fs_devices->latest_bdev == src_device->bdev)
+ fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+ list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+
+ btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ if (src_device->bdev) {
+ /* zero out the old super */
+ btrfs_scratch_superblock(src_device);
+ }
+ /*
+ * this is again a consistent state where no dev_replace procedure
+ * is running, the target device is part of the filesystem, the
+ * source device is not part of the filesystem anymore and its 1st
+ * superblock is scratched out so that it is no longer marked to
+ * belong to this filesystem.
+ */
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ /* write back the superblocks */
+ trans = btrfs_start_transaction(root, 0);
+ if (!IS_ERR(trans))
+ btrfs_commit_transaction(trans, root);
+
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device)
+{
+ int ret;
+
+ if (srcdevid) {
+ ret = 0;
+ *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
+ NULL);
+ if (!*device)
+ ret = -ENOENT;
+ } else {
+ ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
+ device);
+ }
+ return ret;
+}
+
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ /* even if !dev_replace_is_valid, the values are good enough for
+ * the replace_status ioctl */
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ args->status.replace_state = dev_replace->replace_state;
+ args->status.time_started = dev_replace->time_started;
+ args->status.time_stopped = dev_replace->time_stopped;
+ args->status.num_write_errors =
+ atomic64_read(&dev_replace->num_write_errors);
+ args->status.num_uncorrectable_read_errors =
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ args->status.progress_1000 = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ args->status.progress_1000 = 1000;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
+ div64_u64(dev_replace->srcdev->total_bytes, 1000));
+ break;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+}
+
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ args->result = __btrfs_dev_replace_cancel(fs_info);
+ return 0;
+}
+
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = fs_info->tree_root;
+ u64 result;
+ int ret;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ btrfs_dev_replace_unlock(dev_replace);
+ goto leave;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ tgt_device = dev_replace->tgtdev;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ break;
+ }
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+ dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(dev_replace);
+ btrfs_scrub_cancel(fs_info);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+
+leave:
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return result;
+}
+
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->item_needs_writeback = 1;
+ pr_info("btrfs: suspending dev_replace for unmount\n");
+ break;
+ }
+
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+}
+
+/* resume dev_replace procedure that was interrupted by unmount */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ break;
+ }
+ if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+ pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
+ "btrfs: you may cancel the operation after 'mount -o degraded'\n");
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+
+ WARN_ON(atomic_xchg(
+ &fs_info->mutually_exclusive_operation_running, 1));
+ task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+ return PTR_RET(task);
+}
+
+static int btrfs_dev_replace_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_ioctl_dev_replace_args *status_args;
+ u64 progress;
+
+ status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
+ if (status_args) {
+ btrfs_dev_replace_status(fs_info, status_args);
+ progress = status_args->status.progress_1000;
+ kfree(status_args);
+ do_div(progress, 10);
+ printk_in_rcu(KERN_INFO
+ "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ dev_replace->srcdev->missing ? "<missing disk>" :
+ rcu_str_deref(dev_replace->srcdev->name),
+ dev_replace->srcdev->devid,
+ dev_replace->tgtdev ?
+ rcu_str_deref(dev_replace->tgtdev->name) :
+ "<missing target disk>",
+ (unsigned int)progress);
+ }
+ btrfs_dev_replace_continue_on_mount(fs_info);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+
+ return 0;
+}
+
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+
+ ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
+ dev_replace->committed_cursor_left,
+ dev_replace->srcdev->total_bytes,
+ &dev_replace->scrub_progress, 0, 1);
+ ret = btrfs_dev_replace_finishing(fs_info, ret);
+ WARN_ON(ret);
+ return 0;
+}
+
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+ if (!dev_replace->is_valid)
+ return 0;
+
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ /*
+ * return true even if tgtdev is missing (this is
+ * something that can happen if the dev_replace
+ * procedure is suspended by an umount and then
+ * the tgtdev is missing (or "btrfs dev scan") was
+ * not called and the the filesystem is remounted
+ * in degraded state. This does not stop the
+ * dev_replace procedure. It needs to be canceled
+ * manually if the cancelation is wanted.
+ */
+ break;
+ }
+ return 1;
+}
+
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+{
+ /* the beginning is just an optimization for the typical case */
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+acquire_lock:
+ /* this is not a nested case where the same thread
+ * is trying to acqurire the same lock twice */
+ mutex_lock(&dev_replace->lock);
+ mutex_lock(&dev_replace->lock_management_lock);
+ dev_replace->lock_owner = current->pid;
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_lock(&dev_replace->lock_management_lock);
+ if (atomic_read(&dev_replace->nesting_level) > 0 &&
+ dev_replace->lock_owner == current->pid) {
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_unlock(&dev_replace->lock_management_lock);
+ goto acquire_lock;
+}
+
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+{
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ mutex_lock(&dev_replace->lock_management_lock);
+ WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
+ WARN_ON(dev_replace->lock_owner != current->pid);
+ atomic_dec(&dev_replace->nesting_level);
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+ dev_replace->lock_owner = 0;
+ mutex_unlock(&dev_replace->lock_management_lock);
+ mutex_unlock(&dev_replace->lock);
+ } else {
+ mutex_unlock(&dev_replace->lock_management_lock);
+ }
+}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+struct btrfs_ioctl_dev_replace_args;
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+ atomic64_inc(stat_value);
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ int data_size;
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_path *path;
+
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+ /* return back any errors */
+ if (ret < 0)
+ goto out;
+
+ /* nothing found, we're safe */
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+
+ /* we found an item, look for our name in the item */
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di) {
+ /* our exact name was found */
+ ret = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * see if there is room in the item to insert this
+ * name
+ */
+ data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (data_size + btrfs_item_size_nr(leaf, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+ ret = -EOVERFLOW;
+ } else {
+ /* plenty of insertion room */
+ ret = 0;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* lookup a directory item based on index. 'dir' is the objectid
* we're searching in, and 'mod' tells us if you plan on deleting the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22a0439e5a86..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
+#include "dev-replace.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
break;
- num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+ num_copies = btrfs_num_copies(root->fs_info,
eb->start, eb->len);
if (num_copies == 1)
break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ int ret;
+
/*
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
}
static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
int ret;
if (!(rw & REQ_WRITE)) {
-
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
bio, 1);
if (ret)
- return ret;
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
- mirror_num, 0);
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
- return ret;
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
- mirror_num, 0);
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else {
+ /*
+ * kthread helpers are used to submit writes so that
+ * checksumming can happen in parallel across all CPUs
+ */
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num, 0,
+ bio_offset,
+ __btree_submit_bio_start,
+ __btree_submit_bio_done);
}
- /*
- * kthread helpers are used to submit writes so that checksumming
- * can happen in parallel across all CPUs
- */
- return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, mirror_num, 0,
- bio_offset,
- __btree_submit_bio_start,
- __btree_submit_bio_done);
+ if (ret) {
+out_w_error:
+ bio_endio(bio, ret);
+ }
+ return ret;
}
#ifdef CONFIG_MIGRATION
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
static int btree_set_page_dirty(struct page *page)
{
+#ifdef DEBUG
struct extent_buffer *eb;
BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
btrfs_assert_tree_locked(eb);
+#endif
return __set_page_dirty_nobuffers(page);
}
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
root->fs_info->dirty_metadata_bytes);
}
spin_unlock(&root->fs_info->delalloc_lock);
- }
- /* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
- clear_extent_buffer_dirty(buf);
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
+ clear_extent_buffer_dirty(buf);
+ }
}
}
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->root_key.objectid = objectid;
root->anon_dev = 0;
- spin_lock_init(&root->root_times_lock);
+ spin_lock_init(&root->root_item_lock);
}
static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
init_rwsem(&fs_info->extent_commit_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
+ fs_info->dev_replace.lock_owner = 0;
+ atomic_set(&fs_info->dev_replace.nesting_level, 0);
+ mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+ mutex_init(&fs_info->dev_replace.lock_management_lock);
+ mutex_init(&fs_info->dev_replace.lock);
spin_lock_init(&fs_info->qgroup_lock);
fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+
btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
ret |= btrfs_start_workers(&fs_info->delayed_workers);
ret |= btrfs_start_workers(&fs_info->caching_workers);
ret |= btrfs_start_workers(&fs_info->readahead_workers);
+ ret |= btrfs_start_workers(&fs_info->flush_workers);
if (ret) {
err = -ENOMEM;
goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
goto fail_tree_roots;
}
- btrfs_close_extra_devices(fs_devices);
+ /*
+ * keep the device that is marked to be the target device for the
+ * dev_replace procedure
+ */
+ btrfs_close_extra_devices(fs_info, fs_devices, 0);
if (!fs_devices->latest_bdev) {
printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
goto fail_block_groups;
}
+ ret = btrfs_init_dev_replace(fs_info);
+ if (ret) {
+ pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ btrfs_close_extra_devices(fs_info, fs_devices, 1);
+
ret = btrfs_init_space_info(fs_info);
if (ret) {
printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
}
fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ if (fs_info->fs_devices->missing_devices >
+ fs_info->num_tolerated_disk_barrier_failures &&
+ !(sb->s_flags & MS_RDONLY)) {
+ printk(KERN_WARNING
+ "Btrfs: too many missing devices, writeable mount is not allowed\n");
+ goto fail_block_groups;
+ }
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
return ret;
}
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ pr_warn("btrfs: failed to resume dev_replace\n");
+ close_ctree(tree_root);
+ return ret;
+ }
+
return 0;
fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
+ btrfs_stop_workers(&fs_info->flush_workers);
fail_alloc:
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
smp_mb();
/* pause restriper - we want to resume on mount */
- btrfs_pause_balance(root->fs_info);
+ btrfs_pause_balance(fs_info);
- btrfs_scrub_cancel(root);
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+
+ btrfs_scrub_cancel(fs_info);
/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
- btrfs_run_defrag_inodes(fs_info);
+ btrfs_cleanup_defrag_inodes(fs_info);
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
btrfs_stop_workers(&fs_info->readahead_workers);
+ btrfs_stop_workers(&fs_info->flush_workers);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
int was_dirty;
btrfs_assert_tree_locked(buf);
- if (transid != root->fs_info->generation) {
- printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+ if (transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
"found %llu running %llu\n",
(unsigned long long)buf->start,
(unsigned long long)transid,
(unsigned long long)root->fs_info->generation);
- WARN_ON(1);
- }
was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty) {
spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
}
}
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+ int flush_delayed)
{
/*
* looks as though older kernels can get into trouble with
@@ -3411,7 +3463,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
if (current->flags & PF_MEMALLOC)
return;
- btrfs_balance_delayed_items(root);
+ if (flush_delayed)
+ btrfs_balance_delayed_items(root);
num_dirty = root->fs_info->dirty_metadata_bytes;
@@ -3422,25 +3475,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
return;
}
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
{
- /*
- * looks as though older kernels can get into trouble with
- * this code, they end up stuck in balance_dirty_pages forever
- */
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
-
- if (current->flags & PF_MEMALLOC)
- return;
-
- num_dirty = root->fs_info->dirty_metadata_bytes;
+ __btrfs_btree_balance_dirty(root, 1);
+}
- if (num_dirty > thresh) {
- balance_dirty_pages_ratelimited(
- root->fs_info->btree_inode->i_mapping);
- }
- return;
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+{
+ __btrfs_btree_balance_dirty(root, 0);
}
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2025a9132c16..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..521e9d4424f6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "math.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-static u64 div_factor(u64 num, int factor)
-{
- if (factor == 10)
- return num;
- num *= factor;
- do_div(num, 10);
- return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
- if (factor == 100)
- return num;
- num *= factor;
- do_div(num, 100);
- return num;
-}
-
u64 btrfs_find_block_group(struct btrfs_root *root,
u64 search_start, u64 search_hint, int owner)
{
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
/* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+ ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
bytenr, &num_bytes, &bbio, 0);
/* Error condition is -ENOMEM */
if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
kfree(extent_op);
if (ret) {
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+
printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
count++;
if (ret) {
+ if (locked_ref) {
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+ }
printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
@@ -3661,7 +3651,7 @@ out:
static int can_overcommit(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 bytes,
- int flush)
+ enum btrfs_reserve_flush_enum flush)
{
u64 profile = btrfs_get_alloc_profile(root, 0);
u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
avail >>= 1;
/*
- * If we aren't flushing don't let us overcommit too much, say
- * 1/8th of the space. If we can flush, let it overcommit up to
- * 1/2 of the space.
+ * If we aren't flushing all things, let us overcommit up to
+ * 1/2th of the space. If we can flush, don't let us overcommit
+ * too much, let it overcommit up to 1/8 of the space.
*/
- if (flush)
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
avail >>= 3;
else
avail >>= 1;
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
return 0;
}
+static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
+ unsigned long nr_pages,
+ enum wb_reason reason)
+{
+ if (!writeback_in_progress(sb->s_bdi) &&
+ down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, reason);
+ up_read(&sb->s_umount);
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* shrink metadata reservation for delalloc
*/
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
long time_left;
unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
int loops = 0;
+ enum btrfs_reserve_flush_enum flush;
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
- writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
- WB_REASON_FS_FREE_SPACE);
+ writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
+ nr_pages,
+ WB_REASON_FS_FREE_SPACE);
/*
* We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
wait_event(root->fs_info->async_submit_wait,
!atomic_read(&root->fs_info->async_delalloc_pages));
+ if (!trans)
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ else
+ flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
- if (can_overcommit(root, space_info, orig, !trans)) {
+ if (can_overcommit(root, space_info, orig, flush)) {
spin_unlock(&space_info->lock);
break;
}
@@ -3888,7 +3898,7 @@ static int flush_space(struct btrfs_root *root,
* @root - the root we're allocating for
* @block_rsv - the block_rsv we're allocating for
* @orig_bytes - the number of bytes we want
- * @flush - wether or not we can flush to make our reservation
+ * @flush - whether or not we can flush to make our reservation
*
* This will reserve orgi_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
*/
static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes, int flush)
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
u64 used;
@@ -3912,10 +3923,11 @@ again:
ret = 0;
spin_lock(&space_info->lock);
/*
- * We only want to wait if somebody other than us is flushing and we are
- * actually alloed to flush.
+ * We only want to wait if somebody other than us is flushing and we
+ * are actually allowed to flush all things.
*/
- while (flush && !flushing && space_info->flush) {
+ while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+ space_info->flush) {
spin_unlock(&space_info->lock);
/*
* If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
* Couldn't make our reservation, save our place so while we're trying
* to reclaim space we can actually use it instead of somebody else
* stealing it from us.
+ *
+ * We make the other tasks wait for the flush only when we can flush
+ * all things.
*/
- if (ret && flush) {
+ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
flushing = true;
space_info->flush = 1;
}
spin_unlock(&space_info->lock);
- if (!ret || !flush)
+ if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
goto out;
ret = flush_space(root, space_info, num_bytes, orig_bytes,
flush_state);
flush_state++;
+
+ /*
+ * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
+ * would happen. So skip delalloc flush.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT))
+ flush_state = ALLOC_CHUNK;
+
if (!ret)
goto again;
- else if (flush_state <= COMMIT_TRANS)
+ else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ flush_state < COMMIT_TRANS)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ flush_state <= COMMIT_TRANS)
goto again;
out:
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
kfree(rsv);
}
-static inline int __block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
int ret;
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
int btrfs_block_rsv_check(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int min_factor)
{
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
return ret;
}
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
-{
- return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
-{
- return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes)
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
u64 csum_bytes;
unsigned nr_extents = 0;
int extra_reserve = 0;
- int flush = 1;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret;
+ bool delalloc_lock = true;
- /* Need to be holding the i_mutex here if we aren't free space cache */
- if (btrfs_is_free_space_inode(inode))
- flush = 0;
+ /* If we are a free space inode we need to not flush since we will be in
+ * the middle of a transaction commit. We also don't need the delalloc
+ * mutex since we won't race with anybody. We need this mostly to make
+ * lockdep shut its filthy mouth.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ delalloc_lock = false;
+ }
- if (flush && btrfs_transaction_in_commit(root->fs_info))
+ if (flush != BTRFS_RESERVE_NO_FLUSH &&
+ btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
- mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+ if (delalloc_lock)
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
ret = btrfs_qgroup_reserve(root, num_bytes +
nr_extents * root->leafsize);
if (ret) {
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ spin_lock(&BTRFS_I(inode)->lock);
+ calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
return ret;
}
}
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode),
to_free, 0);
}
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ if (root->fs_info->quota_enabled) {
+ btrfs_qgroup_free(root, num_bytes +
+ nr_extents * root->leafsize);
+ }
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
return ret;
}
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
if (to_reserve)
trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 len;
+ bool readonly;
while (start <= end) {
+ readonly = false;
if (!cache ||
start >= cache->key.objectid + cache->key.offset) {
if (cache)
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
}
start += len;
+ space_info = cache->space_info;
- spin_lock(&cache->space_info->lock);
+ spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
- if (cache->ro)
- cache->space_info->bytes_readonly += len;
+ space_info->bytes_pinned -= len;
+ if (cache->ro) {
+ space_info->bytes_readonly += len;
+ readonly = true;
+ }
spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ if (!readonly && global_rsv->space_info == space_info) {
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ len = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += len;
+ space_info->bytes_may_use += len;
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ }
+ spin_unlock(&global_rsv->lock);
+ }
+ spin_unlock(&space_info->lock);
}
if (cache)
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
-static int __get_block_group_index(u64 flags)
+int __get_raid_index(u64 flags)
{
int index;
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
static int get_block_group_index(struct btrfs_block_group_cache *cache)
{
- return __get_block_group_index(cache->flags);
+ return __get_raid_index(cache->flags);
}
enum btrfs_loop_type {
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
/*
* If we couldn't reserve metadata bytes try and use some from
* the global reserve.
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL,
/*DEFAULT_RATELIMIT_BURST*/ 2);
- if (__ratelimit(&_rs)) {
- printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
- WARN_ON(1);
- }
- ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+ ret);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret) {
return block_rsv;
} else if (ret && block_rsv != global_rsv) {
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
target = get_restripe_target(root->fs_info, block_group->flags);
if (target) {
- index = __get_block_group_index(extended_to_chunk(target));
+ index = __get_raid_index(extended_to_chunk(target));
} else {
/*
* this is just a balance, so if we were marked as full
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* check to make sure we can actually find a chunk with enough
* space to fit our block group in.
*/
- if (device->total_bytes > device->bytes_used + min_free) {
+ if (device->total_bytes > device->bytes_used + min_free &&
+ !device->is_tgtdev_for_dev_replace) {
ret = find_free_dev_extent(device, min_free,
&dev_offset, NULL);
if (!ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a94d96..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
{
struct rb_node *node;
- if (end < start) {
- printk(KERN_ERR "btrfs end < start %llu %llu\n",
+ if (end < start)
+ WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
(unsigned long long)end,
(unsigned long long)start);
- WARN_ON(1);
- }
state->start = start;
state->end = end;
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
* the standard behavior is to write all copies in a raid setup. here we only
* want to write the one bad copy. so we do the mapping for ourselves and issue
* submit_bio directly.
- * to avoid any synchonization issues, wait for the data after writing, which
+ * to avoid any synchronization issues, wait for the data after writing, which
* actually prevents the read that triggered the error from finishing.
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num)
{
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
bio->bi_size = 0;
map_length = length;
- ret = btrfs_map_block(map_tree, WRITE, logical,
+ ret = btrfs_map_block(fs_info, WRITE, logical,
&map_length, &bbio, mirror_num);
if (ret) {
bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num)
{
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
u64 start = eb->start;
unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
int ret = 0;
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
- ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+ ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
start, p, mirror_num);
if (ret)
break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
u64 private;
u64 private_failure;
struct io_failure_record *failrec;
- struct btrfs_mapping_tree *map_tree;
+ struct btrfs_fs_info *fs_info;
struct extent_state *state;
int num_copies;
int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
spin_unlock(&BTRFS_I(inode)->io_tree.lock);
if (state && state->start == failrec->start) {
- map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
- num_copies = btrfs_num_copies(map_tree, failrec->logical,
- failrec->len);
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ num_copies = btrfs_num_copies(fs_info, failrec->logical,
+ failrec->len);
if (num_copies > 1) {
- ret = repair_io_failure(map_tree, start, failrec->len,
+ ret = repair_io_failure(fs_info, start, failrec->len,
failrec->logical, page,
failrec->failed_mirror);
did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
* clean_io_failure() clean all those errors at once.
*/
}
- num_copies = btrfs_num_copies(
- &BTRFS_I(inode)->root->fs_info->mapping_tree,
- failrec->logical, failrec->len);
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
if (num_copies == 1) {
/*
* we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
-/*
- * Since writes are async, they will only return -ENOMEM.
- * Reads can return the full range of I/O error conditions.
- */
static int __must_check submit_one_bio(int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
}
if (start + min_len > eb->len) {
- printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+ WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
"wanted %lu %lu\n", (unsigned long long)eb->start,
eb->len, start, min_len);
- WARN_ON(1);
return -EINVAL;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 711d12b80028..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags);
-struct btrfs_mapping_tree;
+struct btrfs_fs_info;
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num);
int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f7..f169d6b11d7f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
struct extent_map *alloc_extent_map(void)
{
struct extent_map *em;
- em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+ em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
em->in_tree = 0;
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(merge, em)) {
em->start = merge->start;
+ em->orig_start = merge->orig_start;
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
merge->in_tree = 0;
- if (merge->generation > em->generation) {
- em->mod_start = em->start;
- em->mod_len = em->len;
- em->generation = merge->generation;
- list_move(&em->list, &tree->modified_extents);
- }
+ em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+ em->mod_start = merge->mod_start;
+ em->generation = max(em->generation, merge->generation);
+ list_move(&em->list, &tree->modified_extents);
list_del_init(&merge->list);
rb_erase(&merge->rb_node, &tree->map);
@@ -223,23 +222,19 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->block_len += merge->len;
rb_erase(&merge->rb_node, &tree->map);
merge->in_tree = 0;
- if (merge->generation > em->generation) {
- em->mod_len = em->len;
- em->generation = merge->generation;
- list_move(&em->list, &tree->modified_extents);
- }
+ em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+ em->generation = max(em->generation, merge->generation);
list_del_init(&merge->list);
free_extent_map(merge);
}
}
/**
- * unpint_extent_cache - unpin an extent from the cache
+ * unpin_extent_cache - unpin an extent from the cache
* @tree: tree to unpin the extent in
* @start: logical offset in the file
* @len: length of the extent
* @gen: generation that this extent has been modified in
- * @prealloc: if this is set we need to clear the prealloc flag
*
* Called after an extent has been written to disk properly. Set the generation
* to the generation that actually added the file item to the inode so we know
@@ -266,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
em->mod_start = em->start;
em->mod_len = em->len;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
prealloc = true;
- clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ clear_bit(EXTENT_FLAG_FILLING, &em->flags);
}
try_merge_map(tree, em);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 679225555f7b..922943ce29e8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
struct extent_map {
struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
u64 mod_start;
u64 mod_len;
u64 orig_start;
+ u64 orig_block_len;
u64 block_start;
u64 block_len;
u64 generation;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ad08e4e4a15..bd38cef42358 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ fail:
return ERR_PTR(ret);
}
-
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
+u64 btrfs_file_extent_length(struct btrfs_path *path)
+{
+ int extent_type;
+ struct btrfs_file_extent_item *fi;
+ u64 len;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(path->nodes[0], fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC)
+ len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
+ else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ len = btrfs_file_extent_inline_len(path->nodes[0], fi);
+ else
+ BUG();
+
+ return len;
+}
static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a8ee75cb96ee..77061bf43edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
#include "compat.h"
#include "volumes.h"
+static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
* when auto defrag is enabled we
* queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
* If an existing record is found the defrag item you
* pass in is freed
*/
-static void __btrfs_add_inode_defrag(struct inode *inode,
+static int __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
entry->transid = defrag->transid;
if (defrag->last_offset > entry->last_offset)
entry->last_offset = defrag->last_offset;
- goto exists;
+ return -EEXIST;
}
}
set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
- return;
+ return 0;
+}
-exists:
- kfree(defrag);
- return;
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+ if (!btrfs_test_opt(root, AUTO_DEFRAG))
+ return 0;
+
+ if (btrfs_fs_closing(root->fs_info))
+ return 0;
+ return 1;
}
/*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct inode_defrag *defrag;
u64 transid;
+ int ret;
- if (!btrfs_test_opt(root, AUTO_DEFRAG))
- return 0;
-
- if (btrfs_fs_closing(root->fs_info))
+ if (!__need_auto_defrag(root))
return 0;
if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
else
transid = BTRFS_I(inode)->root->last_trans;
- defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
return -ENOMEM;
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->root = root->root_key.objectid;
spin_lock(&root->fs_info->defrag_inodes_lock);
- if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
- __btrfs_add_inode_defrag(inode, defrag);
- else
- kfree(defrag);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+ /*
+ * If we set IN_DEFRAG flag and evict the inode from memory,
+ * and then re-read this inode, this new inode doesn't have
+ * IN_DEFRAG flag. At the case, we may find the existed defrag.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
spin_unlock(&root->fs_info->defrag_inodes_lock);
return 0;
}
/*
- * must be called with the defrag_inodes lock held
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
*/
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
- u64 root, u64 ino,
- struct rb_node **next)
+void btrfs_requeue_inode_defrag(struct inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (!__need_auto_defrag(root))
+ goto out;
+
+ /*
+ * Here we don't check the IN_DEFRAG flag, because we need merge
+ * them together.
+ */
+ spin_lock(&root->fs_info->defrag_inodes_lock);
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ spin_unlock(&root->fs_info->defrag_inodes_lock);
+ if (ret)
+ goto out;
+ return;
+out:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want, if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
struct inode_defrag *entry = NULL;
struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
tmp.ino = ino;
tmp.root = root;
- p = info->defrag_inodes.rb_node;
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
else if (ret > 0)
p = parent->rb_right;
else
- return entry;
+ goto out;
}
- if (next) {
- while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
- parent = rb_next(parent);
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
entry = rb_entry(parent, struct inode_defrag, rb_node);
- }
- *next = parent;
+ else
+ entry = NULL;
}
- return NULL;
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
}
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ if (need_resched()) {
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ cond_resched();
+ spin_lock(&fs_info->defrag_inodes_lock);
+ }
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
struct btrfs_root *inode_root;
struct inode *inode;
- struct rb_node *n;
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
- u64 first_ino = 0;
- u64 root_objectid = 0;
int num_defrag;
- int defrag_batch = 1024;
+ /* get the inode */
+ key.objectid = defrag->root;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+ inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(inode_root)) {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return PTR_ERR(inode_root);
+ }
+
+ key.objectid = defrag->ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ if (IS_ERR(inode)) {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return PTR_ERR(inode);
+ }
+
+ /* do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
+ range.start = defrag->last_offset;
+
+ sb_start_write(fs_info->sb);
+ num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ /*
+ * if we filled the whole defrag batch, there
+ * must be more work to do. Queue this defrag
+ * again
+ */
+ if (num_defrag == BTRFS_DEFRAG_BATCH) {
+ defrag->last_offset = range.start;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else if (defrag->last_offset && !defrag->cycled) {
+ /*
+ * we didn't fill our defrag batch, but
+ * we didn't start at zero. Make sure we loop
+ * around to the start of the file.
+ */
+ defrag->last_offset = 0;
+ defrag->cycled = 1;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+
+ iput(inode);
+ return 0;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
atomic_inc(&fs_info->defrag_running);
- spin_lock(&fs_info->defrag_inodes_lock);
while(1) {
- n = NULL;
+ if (!__need_auto_defrag(fs_info->tree_root))
+ break;
/* find an inode to defrag */
- defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
- first_ino, &n);
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+ first_ino);
if (!defrag) {
- if (n) {
- defrag = rb_entry(n, struct inode_defrag,
- rb_node);
- } else if (root_objectid || first_ino) {
+ if (root_objectid || first_ino) {
root_objectid = 0;
first_ino = 0;
continue;
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
}
}
- /* remove it from the rbtree */
first_ino = defrag->ino + 1;
root_objectid = defrag->root;
- rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-
- if (btrfs_fs_closing(fs_info))
- goto next_free;
-
- spin_unlock(&fs_info->defrag_inodes_lock);
-
- /* get the inode */
- key.objectid = defrag->root;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
- key.offset = (u64)-1;
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(inode_root))
- goto next;
-
- key.objectid = defrag->ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
- key.offset = 0;
-
- inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
- if (IS_ERR(inode))
- goto next;
- /* do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
- range.start = defrag->last_offset;
- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
- defrag_batch);
- /*
- * if we filled the whole defrag batch, there
- * must be more work to do. Queue this defrag
- * again
- */
- if (num_defrag == defrag_batch) {
- defrag->last_offset = range.start;
- __btrfs_add_inode_defrag(inode, defrag);
- /*
- * we don't want to kfree defrag, we added it back to
- * the rbtree
- */
- defrag = NULL;
- } else if (defrag->last_offset && !defrag->cycled) {
- /*
- * we didn't fill our defrag batch, but
- * we didn't start at zero. Make sure we loop
- * around to the start of the file.
- */
- defrag->last_offset = 0;
- defrag->cycled = 1;
- __btrfs_add_inode_defrag(inode, defrag);
- defrag = NULL;
- }
-
- iput(inode);
-next:
- spin_lock(&fs_info->defrag_inodes_lock);
-next_free:
- kfree(defrag);
+ __btrfs_run_defrag_inode(fs_info, defrag);
}
- spin_unlock(&fs_info->defrag_inodes_lock);
-
atomic_dec(&fs_info->defrag_running);
/*
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->block_len = em->block_len;
else
split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
split->generation = gen;
split->bdev = em->bdev;
split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->flags = flags;
split->compress_type = em->compress_type;
split->generation = gen;
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
if (compressed) {
split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
} else {
split->block_len = split->len;
split->block_start = em->block_start + diff;
- split->orig_start = split->start;
+ split->orig_start = em->orig_start;
}
ret = add_extent_mapping(em_tree, split);
@@ -1348,7 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
balance_dirty_pages_ratelimited(inode->i_mapping);
if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
- btrfs_btree_balance_dirty(root, 1);
+ btrfs_btree_balance_dirty(root);
pos += copied;
num_written += copied;
@@ -1397,6 +1463,24 @@ out:
return written ? written : err;
}
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_fs_time(inode->i_sb);
+ if (!timespec_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
@@ -1409,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
ssize_t num_written = 0;
ssize_t err = 0;
size_t count, ocount;
+ bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
sb_start_write(inode->i_sb);
@@ -1451,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
- err = file_update_time(file);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
+ /*
+ * We reserve space for updating the inode when we reserve space for the
+ * extent we are going to write, so we will enospc out there. We don't
+ * need to start yet another transaction to update the inode as we will
+ * update the inode when we finish writing whatever data we write.
+ */
+ update_time_for_write(inode);
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
@@ -1466,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
}
}
+ if (sync)
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+
if (unlikely(file->f_flags & O_DIRECT)) {
num_written = __btrfs_direct_write(iocb, iov, nr_segs,
pos, ppos, count, ocount);
@@ -1492,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
* this will either be one more than the running transaction
* or the generation used for the next transaction if there isn't
* one running right now.
+ *
+ * We also have to set last_sub_trans to the current log transid,
+ * otherwise subsequent syncs to a file that's been synced in this
+ * transaction will appear to have already occured.
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+ BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0 || num_written == -EIOCBQUEUED) {
err = generic_write_sync(file, pos, num_written);
if (err < 0 && num_written > 0)
num_written = err;
}
out:
+ if (sync)
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return num_written ? num_written : err;
@@ -1550,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* out of the ->i_mutex. If so, we can flush the dirty pages by
* multi-task, and make the performance up.
*/
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
if (ret)
return ret;
@@ -1561,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* range being left.
*/
atomic_inc(&root->log_batch);
- btrfs_wait_ordered_range(inode, start, end);
+ btrfs_wait_ordered_range(inode, start, end - start + 1);
atomic_inc(&root->log_batch);
/*
@@ -1767,6 +1866,7 @@ out:
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
@@ -1796,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
struct btrfs_path *path;
struct btrfs_block_rsv *rsv;
struct btrfs_trans_handle *trans;
- u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
- u64 lockstart = (offset + mask) & ~mask;
- u64 lockend = ((offset + len) & ~mask) - 1;
+ u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+ u64 lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
u64 cur_offset = lockstart;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
u64 drop_end;
- unsigned long nr;
int ret = 0;
int err = 0;
- bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
- ((offset + len) >> PAGE_CACHE_SHIFT);
+ bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
btrfs_wait_ordered_range(inode, offset, len);
mutex_lock(&inode->i_mutex);
- if (offset >= inode->i_size) {
- mutex_unlock(&inode->i_mutex);
- return 0;
- }
-
+ /*
+ * We needn't truncate any page which is beyond the end of the file
+ * because we are sure there is no data there.
+ */
/*
* Only do this if we are in the same page and we aren't doing the
* entire page.
*/
if (same_page && len < PAGE_CACHE_SIZE) {
- ret = btrfs_truncate_page(inode, offset, len, 0);
+ if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+ ret = btrfs_truncate_page(inode, offset, len, 0);
mutex_unlock(&inode->i_mutex);
return ret;
}
/* zero back part of the first page */
- ret = btrfs_truncate_page(inode, offset, 0, 0);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- return ret;
+ if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ ret = btrfs_truncate_page(inode, offset, 0, 0);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
}
/* zero the front end of the last page */
- ret = btrfs_truncate_page(inode, offset + len, 0, 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- return ret;
+ if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
}
if (lockend < lockstart) {
@@ -1930,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
break;
}
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
@@ -1963,11 +2065,13 @@ out_trans:
if (!trans)
goto out_free;
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
out_free:
btrfs_free_path(path);
btrfs_free_block_rsv(root, rsv);
@@ -1991,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
- u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
+ int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
- alloc_start = offset & ~mask;
- alloc_end = (offset + len + mask) & ~mask;
+ alloc_start = round_down(offset, blocksize);
+ alloc_end = round_up(offset + len, blocksize);
/* Make sure we aren't being give some crap mode */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2009,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* Make sure we have enough space before we do the
* allocation.
*/
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
return ret;
@@ -2077,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
}
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
- last_byte = (last_byte + mask) & ~mask;
+ last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
@@ -2116,11 +2220,11 @@ static long btrfs_fallocate(struct file *file, int mode,
out:
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
return ret;
}
-static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
+static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em;
@@ -2154,7 +2258,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
* before the position we want in case there is outstanding delalloc
* going on here.
*/
- if (origin == SEEK_HOLE && start != 0) {
+ if (whence == SEEK_HOLE && start != 0) {
if (start <= root->sectorsize)
em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
root->sectorsize, 0);
@@ -2188,13 +2292,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
}
}
- if (origin == SEEK_HOLE) {
+ if (whence == SEEK_HOLE) {
*offset = start;
free_extent_map(em);
break;
}
} else {
- if (origin == SEEK_DATA) {
+ if (whence == SEEK_DATA) {
if (em->block_start == EXTENT_MAP_DELALLOC) {
if (start >= inode->i_size) {
free_extent_map(em);
@@ -2231,16 +2335,16 @@ out:
return ret;
}
-static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int ret;
mutex_lock(&inode->i_mutex);
- switch (origin) {
+ switch (whence) {
case SEEK_END:
case SEEK_CUR:
- offset = generic_file_llseek(file, offset, origin);
+ offset = generic_file_llseek(file, offset, whence);
goto out;
case SEEK_DATA:
case SEEK_HOLE:
@@ -2249,7 +2353,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
return -ENXIO;
}
- ret = find_desired_extent(inode, &offset, origin);
+ ret = find_desired_extent(inode, &offset, whence);
if (ret) {
mutex_unlock(&inode->i_mutex);
return ret;
@@ -2292,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_ioctl,
#endif
};
+
+void btrfs_auto_defrag_exit(void)
+{
+ if (btrfs_inode_defrag_cachep)
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+ btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+ sizeof(struct inode_defrag), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_inode_defrag_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1027b854b90c..59ea2e4349c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
{
- WARN_ON(io_ctl->cur);
BUG_ON(io_ctl->index >= io_ctl->num_pages);
io_ctl->page = io_ctl->pages[io_ctl->index++];
io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
* if previous extent entry covers the offset,
* we should return it instead of the bitmap entry
*/
- n = &entry->offset_index;
- while (1) {
- n = rb_prev(n);
- if (!n)
- break;
+ n = rb_prev(&entry->offset_index);
+ if (n) {
prev = rb_entry(n, struct btrfs_free_space,
offset_index);
- if (!prev->bitmap) {
- if (prev->offset + prev->bytes > offset)
- entry = prev;
- break;
- }
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ entry = prev;
}
}
return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
}
if (entry->bitmap) {
- n = &entry->offset_index;
- while (1) {
- n = rb_prev(n);
- if (!n)
- break;
+ n = rb_prev(&entry->offset_index);
+ if (n) {
prev = rb_entry(n, struct btrfs_free_space,
offset_index);
- if (!prev->bitmap) {
- if (prev->offset + prev->bytes > offset)
- return prev;
- break;
- }
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ return prev;
}
if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->key.offset;
- u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* some block groups are so tiny they can't be enveloped by a bitmap, so
* don't even bother to create a bitmap for this
*/
- if (BITS_PER_BITMAP * block_group->sectorsize >
- block_group->key.offset)
+ if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
return false;
return true;
@@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
unsigned long total_found = 0;
int ret;
- i = offset_to_bit(entry->offset, block_group->sectorsize,
+ i = offset_to_bit(entry->offset, ctl->unit,
max_t(u64, offset, entry->offset));
- want_bits = bytes_to_bits(bytes, block_group->sectorsize);
- min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ want_bits = bytes_to_bits(bytes, ctl->unit);
+ min_bits = bytes_to_bits(min_bytes, ctl->unit);
again:
found_bits = 0;
@@ -2325,23 +2313,22 @@ again:
total_found += found_bits;
- if (cluster->max_size < found_bits * block_group->sectorsize)
- cluster->max_size = found_bits * block_group->sectorsize;
+ if (cluster->max_size < found_bits * ctl->unit)
+ cluster->max_size = found_bits * ctl->unit;
if (total_found < want_bits || cluster->max_size < cont1_bytes) {
i = next_zero + 1;
goto again;
}
- cluster->window_start = start * block_group->sectorsize +
- entry->offset;
+ cluster->window_start = start * ctl->unit + entry->offset;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
BUG_ON(ret); /* -EEXIST; Logic error */
trace_btrfs_setup_cluster(block_group, cluster,
- total_found * block_group->sectorsize, 1);
+ total_found * ctl->unit, 1);
return 0;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
* 3 items for pre-allocation
*/
trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
- ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
- trans->bytes_reserved);
+ ret = btrfs_block_rsv_add(root, trans->block_rsv,
+ trans->bytes_reserved,
+ BTRFS_RESERVE_NO_FLUSH);
if (ret)
goto out;
trace_btrfs_space_reservation(root->fs_info, "ino_cache",
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95542a1b3dfc..67ed24ae86bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
@@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+ u64 len, u64 orig_start,
+ u64 block_start, u64 block_len,
+ u64 orig_block_len, int type);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
@@ -698,14 +703,19 @@ retry:
em->block_start = ins.objectid;
em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
* required to start IO on it. It may be clean and already done with
* IO when we return.
*/
-static noinline int cow_file_range(struct inode *inode,
- struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- int unlock)
+static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_root *root,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written,
+ int unlock)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
int ret = 0;
BUG_ON(btrfs_is_free_space_inode(inode));
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, locked_page,
- EXTENT_CLEAR_UNLOCK_PAGE |
- EXTENT_CLEAR_UNLOCK |
- EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_DIRTY |
- EXTENT_SET_WRITEBACK |
- EXTENT_END_WRITEBACK);
- return PTR_ERR(trans);
- }
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
num_bytes = max(blocksize, num_bytes);
disk_num_bytes = num_bytes;
- ret = 0;
/* if this is a small write inside eof, kick off defrag */
if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
em->block_start = ins.objectid;
em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
}
- ret = 0;
out:
- btrfs_end_transaction(trans, root);
-
return ret;
+
out_unlock:
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
goto out;
}
+static noinline int cow_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written,
+ int unlock)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+ return PTR_ERR(trans);
+ }
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ ret = __cow_file_range(trans, inode, root, locked_page, start, end,
+ page_started, nr_written, unlock);
+
+ btrfs_end_transaction(trans, root);
+
+ return ret;
+}
+
/*
* work queue call back to started compression on a file and pages
*/
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 extent_offset;
u64 disk_bytenr;
u64 num_bytes;
+ u64 disk_num_bytes;
int extent_type;
int ret, err;
int type;
@@ -1228,6 +1260,8 @@ next_slot:
extent_offset = btrfs_file_extent_offset(leaf, fi);
extent_end = found_key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
+ disk_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
if (extent_end <= start) {
path->slots[0]++;
goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
btrfs_release_path(path);
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page, cow_start,
- found_key.offset - 1, page_started,
- nr_written, 1);
+ ret = __cow_file_range(trans, inode, root, locked_page,
+ cow_start, found_key.offset - 1,
+ page_started, nr_written, 1);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error;
@@ -1298,16 +1332,21 @@ out_check:
em = alloc_extent_map();
BUG_ON(!em); /* -ENOMEM */
em->start = cur_offset;
- em->orig_start = em->start;
+ em->orig_start = found_key.offset - extent_offset;
em->len = num_bytes;
em->block_len = num_bytes;
em->block_start = disk_bytenr;
+ em->orig_block_len = disk_num_bytes;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
}
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page, cow_start, end,
- page_started, nr_written, 1);
+ ret = __cow_file_range(trans, inode, root, locked_page,
+ cow_start, end,
+ page_started, nr_written, 1);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct btrfs_mapping_tree *map_tree;
u64 logical = (u64)bio->bi_sector << 9;
u64 length = 0;
u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
return 0;
length = bio->bi_size;
- map_tree = &root->fs_info->mapping_tree;
map_length = length;
- ret = btrfs_map_block(map_tree, READ, logical,
+ ret = btrfs_map_block(root->fs_info, READ, logical,
&map_length, NULL, 0);
- /* Will always return 0 or 1 with map_multi == NULL */
+ /* Will always return 0 with map_multi == NULL */
BUG_ON(ret < 0);
if (map_length < length + size)
return 1;
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+ int ret;
+
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
}
/*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
int ret = 0;
int skip_sum;
int metadata = 0;
+ int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
if (!(rw & REQ_WRITE)) {
ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
if (ret)
- return ret;
+ goto out;
if (bio_flags & EXTENT_BIO_COMPRESSED) {
- return btrfs_submit_compressed_read(inode, bio,
- mirror_num, bio_flags);
+ ret = btrfs_submit_compressed_read(inode, bio,
+ mirror_num,
+ bio_flags);
+ goto out;
} else if (!skip_sum) {
ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
if (ret)
- return ret;
+ goto out;
}
goto mapit;
- } else if (!skip_sum) {
+ } else if (async && !skip_sum) {
/* csum items have already been cloned */
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
- return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num,
bio_flags, bio_offset,
__btrfs_submit_bio_start,
__btrfs_submit_bio_done);
+ goto out;
+ } else if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ if (ret)
+ goto out;
}
mapit:
- return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+ if (ret < 0)
+ bio_endio(bio, ret);
+ return ret;
}
/*
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state)
{
- if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
- WARN_ON(1);
+ WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
cached_state, GFP_NOFS);
}
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
- ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (!ret) {
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_update_inode_fallback(trans, root, inode);
- if (ret) /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root);
+ else
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
}
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
goto out;
}
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
- ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
- ret = btrfs_update_inode_fallback(trans, root, inode);
- if (ret) { /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, root, ret);
- goto out_unlock;
- }
- } else {
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) { /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
}
ret = 0;
out_unlock:
@@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
struct inode *inode = dentry->d_inode;
int ret;
- unsigned long nr = 0;
trans = __unlink_start_trans(dir, dentry);
if (IS_ERR(trans))
@@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
}
out:
- nr = trans->blocks_used;
__unlink_end_trans(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return ret;
}
@@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
int err = 0;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
- unsigned long nr = 0;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
@@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!err)
btrfs_i_size_write(inode, 0);
out:
- nr = trans->blocks_used;
__unlink_end_trans(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
if (ret)
goto out;
- ret = -ENOMEM;
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ ret = -ENOMEM;
goto out;
}
@@ -3550,7 +3595,6 @@ again:
goto out_unlock;
}
- ret = 0;
if (offset != PAGE_CACHE_SIZE) {
if (!len)
len = PAGE_CACHE_SIZE - offset;
@@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
@@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv, *global_rsv;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- unsigned long nr;
int ret;
trace_btrfs_inode_evict(inode);
@@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode)
* inode item when doing the truncate.
*/
while (1) {
- ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
+ ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ BTRFS_RESERVE_FLUSH_LIMIT);
/*
* Try and steal from the global reserve since we will
@@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
- trans = btrfs_start_transaction_noflush(root, 1);
+ trans = btrfs_start_transaction_lflush(root, 1);
if (IS_ERR(trans)) {
btrfs_orphan_del(NULL, inode);
btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode)
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
trans = NULL;
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
}
btrfs_free_block_rsv(root, rsv);
@@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
btrfs_return_ino(root, btrfs_ino(inode));
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
no_delete:
clear_inode(inode);
return;
@@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (S_ISREG(mode)) {
if (btrfs_test_opt(root, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(root, NODATACOW) ||
- (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
+ if (btrfs_test_opt(root, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
}
@@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
ret = btrfs_insert_dir_item(trans, root, name, name_len,
parent_inode, &key,
btrfs_inode_type(inode), index);
- if (ret == -EEXIST)
+ if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
int err;
int drop_inode = 0;
u64 objectid;
- unsigned long nr = 0;
u64 index = 0;
if (!new_valid_dev(rdev))
@@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
+ err = btrfs_update_inode(trans, root, inode);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
@@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
d_instantiate(dentry, inode);
}
out_unlock:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
- int drop_inode = 0;
+ int drop_inode_on_err = 0;
int err;
- unsigned long nr = 0;
u64 objectid;
u64 index = 0;
@@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
err = PTR_ERR(inode);
goto out_unlock;
}
+ drop_inode_on_err = 1;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err) {
- drop_inode = 1;
+ if (err)
+ goto out_unlock;
+
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
goto out_unlock;
- }
/*
* If the active LSM wants to access the inode during
@@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
- drop_inode = 1;
- else {
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- d_instantiate(dentry, inode);
- }
+ goto out_unlock;
+
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ d_instantiate(dentry, inode);
+
out_unlock:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- if (drop_inode) {
+ if (err && drop_inode_on_err) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = old_dentry->d_inode;
u64 index;
- unsigned long nr = 0;
int err;
int drop_inode = 0;
@@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ihold(inode);
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
@@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, inode, NULL, parent);
}
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int drop_on_err = 0;
u64 objectid = 0;
u64 index = 0;
- unsigned long nr = 1;
/*
* 2 items for inode and ref
@@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
drop_on_err = 0;
out_fail:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -5340,6 +5384,7 @@ again:
if (start + len <= found_key.offset)
goto not_found;
em->start = start;
+ em->orig_start = start;
em->len = found_key.offset - start;
goto not_found_em;
}
@@ -5350,6 +5395,8 @@ again:
em->len = extent_end - extent_start;
em->orig_start = extent_start -
btrfs_file_extent_offset(leaf, item);
+ em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
+ item);
bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
if (bytenr == 0) {
em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5406,7 @@ again:
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
em->block_start = bytenr;
- em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
- item);
+ em->block_len = em->orig_block_len;
} else {
bytenr += btrfs_file_extent_offset(leaf, item);
em->block_start = bytenr;
@@ -5390,7 +5436,8 @@ again:
em->start = extent_start + extent_offset;
em->len = (copy_size + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
- em->orig_start = EXTENT_MAP_INLINE;
+ em->orig_block_len = em->len;
+ em->orig_start = em->start;
if (compress_type) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
@@ -5439,11 +5486,11 @@ again:
extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
} else {
- printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
- WARN_ON(1);
+ WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
}
not_found:
em->start = start;
+ em->orig_start = start;
em->len = len;
not_found_em:
em->block_start = EXTENT_MAP_HOLE;
@@ -5645,38 +5692,19 @@ out:
}
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
- struct extent_map *em,
u64 start, u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
struct btrfs_key ins;
u64 alloc_hint;
int ret;
- bool insert = false;
-
- /*
- * Ok if the extent map we looked up is a hole and is for the exact
- * range we want, there is no reason to allocate a new one, however if
- * it is not right then we need to free this one and drop the cache for
- * our range.
- */
- if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
- em->len != len) {
- free_extent_map(em);
- em = NULL;
- insert = true;
- btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
- }
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return ERR_CAST(trans);
- if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
- btrfs_add_inode_defrag(trans, inode);
-
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
goto out;
}
- if (!em) {
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- }
-
- em->start = start;
- em->orig_start = em->start;
- em->len = ins.offset;
-
- em->block_start = ins.objectid;
- em->block_len = ins.offset;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
-
- /*
- * We need to do this because if we're using the original em we searched
- * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
- */
- em->flags = 0;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
- while (insert) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST)
- break;
- btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
- }
+ em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+ ins.offset, ins.offset, 0);
+ if (IS_ERR(em))
+ goto out;
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
@@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
u64 len, u64 orig_start,
u64 block_start, u64 block_len,
- int type)
+ u64 orig_block_len, int type)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
@@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
em->block_len = block_len;
em->block_start = block_start;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->orig_block_len = orig_block_len;
+ em->generation = -1;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
if (type == BTRFS_ORDERED_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
do {
btrfs_drop_extent_cache(inode, em->start,
em->start + em->len - 1, 0);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
+ if (!ret)
+ list_move(&em->list,
+ &em_tree->modified_extents);
write_unlock(&em_tree->lock);
} while (ret == -EEXIST);
@@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto must_cow;
if (can_nocow_odirect(trans, inode, start, len) == 1) {
- u64 orig_start = em->start;
+ u64 orig_start = em->orig_start;
+ u64 orig_block_len = em->orig_block_len;
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
em = create_pinned_em(inode, start, len,
orig_start,
- block_start, len, type);
+ block_start, len,
+ orig_block_len, type);
if (IS_ERR(em)) {
btrfs_end_transaction(trans, root);
goto unlock_err;
@@ -6077,7 +6085,8 @@ must_cow:
* it above
*/
len = bh_result->b_size;
- em = btrfs_new_extent_direct(inode, em, start, len);
+ free_extent_map(em);
+ em = btrfs_new_extent_direct(inode, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
@@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ if (async_submit)
+ async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
bio_get(bio);
if (!write) {
@@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
struct bio *bio;
struct bio *orig_bio = dip->orig_bio;
struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
int async_submit = 0;
map_length = orig_bio->bi_size;
- ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+ ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
&map_length, NULL, 0);
if (ret) {
bio_put(orig_bio);
@@ -6429,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_end_io = btrfs_end_dio_bio;
map_length = orig_bio->bi_size;
- ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+ ret = btrfs_map_block(root->fs_info, READ,
+ start_sector << 9,
&map_length, NULL, 0);
if (ret) {
bio_put(bio);
@@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
btrfs_submit_direct, 0);
}
+#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
+
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
+ int ret;
+
+ ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+ if (ret)
+ return ret;
+
return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
}
@@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode)
int ret;
int err = 0;
struct btrfs_trans_handle *trans;
- unsigned long nr;
u64 mask = root->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
@@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode)
break;
}
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
@@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode)
if (ret && !err)
err = ret;
- nr = trans->blocks_used;
ret = btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
}
out:
@@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
ei->io_tree.track_uptodate = 1;
ei->io_failure_tree.track_uptodate = 1;
+ atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_path_cachep);
if (btrfs_free_space_cachep)
kmem_cache_destroy(btrfs_free_space_cachep);
+ if (btrfs_delalloc_work_cachep)
+ kmem_cache_destroy(btrfs_delalloc_work_cachep);
}
int btrfs_init_cachep(void)
@@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void)
if (!btrfs_free_space_cachep)
goto fail;
+ btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+ sizeof(struct btrfs_delalloc_work), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_delalloc_work_cachep)
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
@@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
+
+
+ /* check for collisions, even if the name isn't there */
+ ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+
+ if (ret) {
+ if (ret == -EEXIST) {
+ /* we shouldn't get
+ * eexist without a new_inode */
+ if (!new_inode) {
+ WARN_ON(1);
+ return ret;
+ }
+ } else {
+ /* maybe -EOVERFLOW */
+ return ret;
+ }
+ }
+ ret = 0;
+
/*
* we're using rename to replace one file with another.
* and the replacement file is large. Start IO on it now so
@@ -7447,6 +7496,49 @@ out_notrans:
return ret;
}
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+ struct btrfs_delalloc_work *delalloc_work;
+
+ delalloc_work = container_of(work, struct btrfs_delalloc_work,
+ work);
+ if (delalloc_work->wait)
+ btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
+ else
+ filemap_flush(delalloc_work->inode->i_mapping);
+
+ if (delalloc_work->delay_iput)
+ btrfs_add_delayed_iput(delalloc_work->inode);
+ else
+ iput(delalloc_work->inode);
+ complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput)
+{
+ struct btrfs_delalloc_work *work;
+
+ work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ if (!work)
+ return NULL;
+
+ init_completion(&work->completion);
+ INIT_LIST_HEAD(&work->list);
+ work->inode = inode;
+ work->wait = wait;
+ work->delay_iput = delay_iput;
+ work->work.func = btrfs_run_delalloc_work;
+
+ return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+ wait_for_completion(&work->completion);
+ kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
/*
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
@@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
struct list_head *head = &root->fs_info->delalloc_inodes;
struct btrfs_inode *binode;
struct inode *inode;
+ struct btrfs_delalloc_work *work, *next;
+ struct list_head works;
+ int ret = 0;
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ INIT_LIST_HEAD(&works);
+
spin_lock(&root->fs_info->delalloc_lock);
while (!list_empty(head)) {
binode = list_entry(head->next, struct btrfs_inode,
@@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
list_del_init(&binode->delalloc_inodes);
spin_unlock(&root->fs_info->delalloc_lock);
if (inode) {
- filemap_flush(inode->i_mapping);
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
+ work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ if (!work) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ list_add_tail(&work->list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &work->work);
}
cond_resched();
spin_lock(&root->fs_info->delalloc_lock);
@@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
- return 0;
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+ return ret;
}
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
unsigned long ptr;
struct btrfs_file_extent_item *ei;
struct extent_buffer *leaf;
- unsigned long nr = 0;
name_len = strlen(symname) + 1;
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
out_unlock:
if (!err)
d_instantiate(dentry, inode);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->len = ins.offset;
em->block_start = ins.objectid;
em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5b3429ab8ec1..4b4516770f05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
#include "backref.h"
#include "rcu-string.h"
#include "send.h"
+#include "dev-replace.h"
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
}
- if (flags & BTRFS_INODE_NODATACOW)
+ if (flags & BTRFS_INODE_NODATACOW) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ }
btrfs_update_iflags(inode);
}
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
ret = btrfs_commit_transaction(trans,
root->fs_info->extent_root);
}
- if (ret)
+ if (ret) {
+ /* cleanup_transaction has freed this for us */
+ if (trans->aborted)
+ pending_snapshot = NULL;
goto fail;
+ }
ret = pending_snapshot->error;
if (ret)
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
if (error)
goto out_dput;
+ /*
+ * even if this name doesn't exist, we may get hash collisions.
+ * check for them now when we can safely fail
+ */
+ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+ dir->i_ino, name,
+ namelen);
+ if (error)
+ goto out_dput;
+
down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
@@ -1293,12 +1311,13 @@ out_ra:
return ret;
}
-static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
u64 new_size;
u64 old_size;
u64 devid = 1;
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
@@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- mutex_lock(&root->fs_info->volume_mutex);
- if (root->fs_info->balance_ctl) {
- printk(KERN_INFO "btrfs: balance in progress\n");
- ret = -EINVAL;
- goto out;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ return -EINPROGRESS;
}
+ mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -1339,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
- device = btrfs_find_device(root, devid, NULL, NULL);
+ device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
if (!device) {
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
@@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
}
}
+ if (device->is_tgtdev_for_dev_replace) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
old_size = device->total_bytes;
if (mod < 0) {
@@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
btrfs_commit_transaction(trans, root);
} else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
- }
+ } /* equal, nothing need to do */
out_free:
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
+ mnt_drop_write_file(file);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
return ret;
}
@@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
if (btrfs_root_readonly(root))
return -EROFS;
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ return -EINPROGRESS;
+ }
ret = mnt_want_write_file(file);
- if (ret)
+ if (ret) {
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+ 0);
return ret;
+ }
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
@@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
}
out:
mnt_drop_write_file(file);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
return ret;
}
@@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- mutex_lock(&root->fs_info->volume_mutex);
- if (root->fs_info->balance_ctl) {
- printk(KERN_INFO "btrfs: balance in progress\n");
- ret = -EINVAL;
- goto out;
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ return -EINPROGRESS;
}
+ mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
return ret;
}
-static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
- mutex_lock(&root->fs_info->volume_mutex);
- if (root->fs_info->balance_ctl) {
- printk(KERN_INFO "btrfs: balance in progress\n");
- ret = -EINVAL;
- goto out;
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ mnt_drop_write_file(file);
+ return -EINPROGRESS;
}
+ mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
+ mnt_drop_write_file(file);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
return ret;
}
@@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
s_uuid = di_args->uuid;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
+ dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
struct btrfs_disk_key disk_key;
u64 objectid = 0;
u64 dir_id;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (copy_from_user(&objectid, argp, sizeof(objectid)))
- return -EFAULT;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+ ret = -EFAULT;
+ goto out;
+ }
if (!objectid)
objectid = root->root_key.objectid;
@@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
location.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
- if (IS_ERR(new_root))
- return PTR_ERR(new_root);
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
+ goto out;
+ }
- if (btrfs_root_refs(&new_root->root_item) == 0)
- return -ENOENT;
+ if (btrfs_root_refs(&new_root->root_item) == 0) {
+ ret = -ENOENT;
+ goto out;
+ }
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_free_path(path);
- return PTR_ERR(trans);
+ ret = PTR_ERR(trans);
+ goto out;
}
dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_end_transaction(trans, root);
printk(KERN_ERR "Umm, you don't have the default dir item, "
"this isn't going to work\n");
- return -ENOENT;
+ ret = -ENOENT;
+ goto out;
}
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
btrfs_end_transaction(trans, root);
-
- return 0;
+out:
+ mnt_drop_write_file(file);
+ return ret;
}
void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)
return 0;
}
-static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+ void __user *argp)
{
- struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
struct btrfs_trans_handle *trans;
u64 transid;
int ret;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ return PTR_ERR(trans);
+
+ /* No running transaction, don't bother */
+ transid = root->fs_info->last_trans_committed;
+ goto out;
+ }
transid = trans->transid;
ret = btrfs_commit_transaction_async(trans, root, 0);
if (ret) {
btrfs_end_transaction(trans, root);
return ret;
}
-
+out:
if (argp)
if (copy_to_user(argp, &transid, sizeof(transid)))
return -EFAULT;
return 0;
}
-static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+ void __user *argp)
{
- struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
u64 transid;
if (argp) {
@@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
return btrfs_wait_for_commit(root, transid);
}
-static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
- int ret;
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_scrub_args *sa;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
if (IS_ERR(sa))
return PTR_ERR(sa);
- ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
- &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
+ if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+ &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+ 0);
if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
+ if (!(sa->flags & BTRFS_SCRUB_READONLY))
+ mnt_drop_write_file(file);
+out:
kfree(sa);
return ret;
}
@@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- return btrfs_scrub_cancel(root);
+ return btrfs_scrub_cancel(root->fs_info);
}
static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
return ret;
}
+static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_dev_replace_args *p;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ p = memdup_user(arg, sizeof(*p));
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ switch (p->cmd) {
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+ if (atomic_xchg(
+ &root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ ret = -EINPROGRESS;
+ } else {
+ ret = btrfs_dev_replace_start(root, p);
+ atomic_set(
+ &root->fs_info->mutually_exclusive_operation_running,
+ 0);
+ }
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+ btrfs_dev_replace_status(root->fs_info, p);
+ ret = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+ ret = btrfs_dev_replace_cancel(root->fs_info, p);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_to_user(arg, p, sizeof(*p)))
+ ret = -EFAULT;
+
+ kfree(p);
+ return ret;
+}
+
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
int ret = 0;
@@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
int ret;
+ int need_to_clear_lock = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bargs = NULL;
}
- if (fs_info->balance_ctl) {
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
ret = -EINPROGRESS;
goto out_bargs;
}
+ need_to_clear_lock = 1;
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
if (!bctl) {
@@ -3387,6 +3514,9 @@ do_balance:
out_bargs:
kfree(bargs);
out:
+ if (need_to_clear_lock)
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+ 0);
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
mnt_drop_write_file(file);
@@ -3441,8 +3571,9 @@ out:
return ret;
}
-static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_quota_ctl_args *sa;
struct btrfs_trans_handle *trans = NULL;
int ret;
@@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa))
- return PTR_ERR(sa);
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
if (err && !ret)
ret = err;
}
-
out:
kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
return ret;
}
-static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_qgroup_assign_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa))
- return PTR_ERR(sa);
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
out:
kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
return ret;
}
-static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_qgroup_create_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa))
- return PTR_ERR(sa);
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
out:
kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
return ret;
}
-static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_qgroup_limit_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa))
- return PTR_ERR(sa);
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
out:
kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
return ret;
}
@@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEFRAG_RANGE:
return btrfs_ioctl_defrag(file, argp);
case BTRFS_IOC_RESIZE:
- return btrfs_ioctl_resize(root, argp);
+ return btrfs_ioctl_resize(file, argp);
case BTRFS_IOC_ADD_DEV:
return btrfs_ioctl_add_dev(root, argp);
case BTRFS_IOC_RM_DEV:
- return btrfs_ioctl_rm_dev(root, argp);
+ return btrfs_ioctl_rm_dev(file, argp);
case BTRFS_IOC_FS_INFO:
return btrfs_ioctl_fs_info(root, argp);
case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int
btrfs_sync_fs(file->f_dentry->d_sb, 1);
return 0;
case BTRFS_IOC_START_SYNC:
- return btrfs_ioctl_start_sync(file, argp);
+ return btrfs_ioctl_start_sync(root, argp);
case BTRFS_IOC_WAIT_SYNC:
- return btrfs_ioctl_wait_sync(file, argp);
+ return btrfs_ioctl_wait_sync(root, argp);
case BTRFS_IOC_SCRUB:
- return btrfs_ioctl_scrub(root, argp);
+ return btrfs_ioctl_scrub(file, argp);
case BTRFS_IOC_SCRUB_CANCEL:
return btrfs_ioctl_scrub_cancel(root, argp);
case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(root, argp);
case BTRFS_IOC_QUOTA_CTL:
- return btrfs_ioctl_quota_ctl(root, argp);
+ return btrfs_ioctl_quota_ctl(file, argp);
case BTRFS_IOC_QGROUP_ASSIGN:
- return btrfs_ioctl_qgroup_assign(root, argp);
+ return btrfs_ioctl_qgroup_assign(file, argp);
case BTRFS_IOC_QGROUP_CREATE:
- return btrfs_ioctl_qgroup_create(root, argp);
+ return btrfs_ioctl_qgroup_create(file, argp);
case BTRFS_IOC_QGROUP_LIMIT:
- return btrfs_ioctl_qgroup_limit(root, argp);
+ return btrfs_ioctl_qgroup_limit(file, argp);
+ case BTRFS_IOC_DEV_REPLACE:
+ return btrfs_ioctl_dev_replace(root, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
char name[BTRFS_PATH_NAME_MAX + 1];
};
+#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+
#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
};
-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
+struct btrfs_ioctl_dev_replace_start_params {
+ __u64 srcdevid; /* in, if 0, use srcdev_name instead */
+ __u64 cont_reading_from_srcdev_mode; /* in, see #define
+ * above */
+ __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
+ __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
+};
+
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
+struct btrfs_ioctl_dev_replace_status_params {
+ __u64 replace_state; /* out, see #define above */
+ __u64 progress_1000; /* out, 0 <= x <= 1000 */
+ __u64 time_started; /* out, seconds since 1-Jan-1970 */
+ __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
+ __u64 num_write_errors; /* out */
+ __u64 num_uncorrectable_read_errors; /* out */
+};
+
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
+struct btrfs_ioctl_dev_replace_args {
+ __u64 cmd; /* in */
+ __u64 result; /* out */
+
+ union {
+ struct btrfs_ioctl_dev_replace_start_params start;
+ struct btrfs_ioctl_dev_replace_status_params status;
+ }; /* in/out */
+
+ __u64 spare[64];
+};
+
struct btrfs_ioctl_dev_info_args {
__u64 devid; /* in/out */
__u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
struct btrfs_ioctl_qgroup_limit_args)
#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
struct btrfs_ioctl_get_dev_stats)
+#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
+ struct btrfs_ioctl_dev_replace_args)
+
#endif
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright (C) 2012 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_MATH_H
+#define __BTRFS_MATH_H
+
+#include <asm/div64.h>
+
+static inline u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ do_div(num, 10);
+ return num;
+}
+
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor == 100)
+ return num;
+ num *= factor;
+ do_div(num, 100);
+ return num;
+}
+
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02ba28e..f10731297040 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->root_extent_list);
+ INIT_LIST_HEAD(&entry->work_list);
+ init_completion(&entry->completion);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
wake_up(&entry->wait);
}
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+ btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ complete(&ordered->completion);
+}
+
/*
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
{
- struct list_head splice;
+ struct list_head splice, works;
struct list_head *cur;
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *ordered, *next;
struct inode *inode;
INIT_LIST_HEAD(&splice);
+ INIT_LIST_HEAD(&works);
spin_lock(&root->fs_info->ordered_extent_lock);
list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
spin_unlock(&root->fs_info->ordered_extent_lock);
if (inode) {
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
+ ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ list_add_tail(&ordered->work_list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &ordered->flush_work);
} else {
btrfs_put_ordered_extent(ordered);
}
+ cond_resched();
spin_lock(&root->fs_info->ordered_extent_lock);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ list_for_each_entry_safe(ordered, next, &works, work_list) {
+ list_del_init(&ordered->work_list);
+ wait_for_completion(&ordered->completion);
+
+ inode = ordered->inode;
+ btrfs_put_ordered_extent(ordered);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+
+ cond_resched();
+ }
}
/*
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
* extra check to make sure the ordered operation list really is empty
* before we return
*/
-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
struct list_head splice;
+ struct list_head works;
+ struct btrfs_delalloc_work *work, *next;
+ int ret = 0;
INIT_LIST_HEAD(&splice);
+ INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
list_splice_init(&root->fs_info->ordered_operations, &splice);
while (!list_empty(&splice)) {
+
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
@@ -549,15 +579,26 @@ again:
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&root->fs_info->ordered_operations);
}
+
+ if (!inode)
+ continue;
spin_unlock(&root->fs_info->ordered_extent_lock);
- if (inode) {
- if (wait)
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- else
- filemap_flush(inode->i_mapping);
- btrfs_add_delayed_iput(inode);
+ work = btrfs_alloc_delalloc_work(inode, wait, 1);
+ if (!work) {
+ if (list_empty(&BTRFS_I(inode)->ordered_operations))
+ list_add_tail(&btrfs_inode->ordered_operations,
+ &splice);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ list_splice_tail(&splice,
+ &root->fs_info->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ ret = -ENOMEM;
+ goto out;
}
+ list_add_tail(&work->list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &work->work);
cond_resched();
spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
goto again;
spin_unlock(&root->fs_info->ordered_extent_lock);
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
mutex_unlock(&root->fs_info->ordered_operations_mutex);
+ return ret;
}
/*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
u64 end;
u64 orig_end;
struct btrfs_ordered_extent *ordered;
- int found;
if (start + len < start) {
orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
- found = 0;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- found++;
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
btrfs_put_ordered_extent(ordered);
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
if (last_mod < root->fs_info->last_trans_committed)
return;
- /*
- * the transaction is already committing. Just start the IO and
- * don't bother with all of this list nonsense
- */
- if (trans && root->fs_info->running_transaction->blocked) {
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- return;
- }
-
spin_lock(&root->fs_info->ordered_extent_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +994,7 @@ int __init ordered_data_init(void)
NULL);
if (!btrfs_ordered_extent_cache)
return -ENOMEM;
+
return 0;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b46a37..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
-#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
* has done its due diligence in updating
* the isize. */
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
struct list_head root_extent_list;
struct btrfs_work work;
-};
+ struct completion completion;
+ struct btrfs_work flush_work;
+ struct list_head work_list;
+};
/*
* calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
case BTRFS_DEV_STATS_KEY:
printk(KERN_INFO "\t\tdevice stats\n");
break;
+ case BTRFS_DEV_REPLACE_KEY:
+ printk(KERN_INFO "\t\tdev replace\n");
+ break;
};
}
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
#include "volumes.h"
#include "disk-io.h"
#include "transaction.h"
+#include "dev-replace.h"
#undef DEBUG
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
struct reada_extent *re = NULL;
struct reada_extent *re_exist = NULL;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct btrfs_bio *bbio = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
int nzones = 0;
int i;
unsigned long index = logical >> PAGE_CACHE_SHIFT;
+ int dev_replace_is_ongoing;
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
* map block
*/
length = blocksize;
- ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
+ ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+ &bbio, 0);
if (ret || !bbio || length < blocksize)
goto error;
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
}
/* insert extent in reada_tree + all per-device trees, all or nothing */
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
BUG_ON(!re_exist);
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
goto error;
}
prev_dev = NULL;
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+ &fs_info->dev_replace);
for (i = 0; i < nzones; ++i) {
dev = bbio->stripes[i].dev;
if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
*/
continue;
}
+ if (!dev->bdev) {
+ /* cannot read ahead on missing device */
+ continue;
+ }
+ if (dev_replace_is_ongoing &&
+ dev == fs_info->dev_replace.tgtdev) {
+ /*
+ * as this device is selected for reading only as
+ * a last resort, skip it for read ahead.
+ */
+ continue;
+ }
prev_dev = dev;
ret = radix_tree_insert(&dev->reada_extents, index, re);
if (ret) {
while (--i >= 0) {
dev = bbio->stripes[i].dev;
BUG_ON(dev == NULL);
+ /* ignore whether the entry was inserted */
radix_tree_delete(&dev->reada_extents, index);
}
BUG_ON(fs_info == NULL);
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
goto error;
}
}
spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
kfree(bbio);
return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
generation = btrfs_header_generation(node);
free_extent_buffer(node);
- reada_add_block(rc, start, &max_key, level, generation);
+ if (reada_add_block(rc, start, &max_key, level, generation)) {
+ kfree(rc);
+ return ERR_PTR(-ENOMEM);
+ }
reada_start_machine(root->fs_info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa128fc..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root_item *root_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
- unsigned long nr;
int level;
int max_level;
int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BUG_ON(IS_ERR(trans));
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
BUG_ON(ret != -EAGAIN);
ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path->slots[level]);
root_item->drop_level = level;
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
btrfs_update_reloc_root(trans, root);
}
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
- ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret)
err = ret;
}
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
if (ret == -EAGAIN)
rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
struct btrfs_path *path;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
- unsigned long nr;
int ret = 0;
if (inode)
@@ -3293,9 +3292,8 @@ truncate:
ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
btrfs_free_path(path);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
out:
iput(inode);
return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
* is no reservation in transaction handle.
*/
ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
- rc->extent_root->nodesize * 256);
+ rc->extent_root->nodesize * 256,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret)
return ret;
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
- unsigned long nr;
u64 flags;
u32 item_size;
int ret;
@@ -3828,9 +3826,8 @@ restart:
ret = btrfs_commit_transaction(trans, rc->extent_root);
BUG_ON(ret);
} else {
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root, nr);
+ btrfs_btree_balance_dirty(rc->extent_root);
}
trans = NULL;
@@ -3860,9 +3857,8 @@ restart:
GFP_NOFS);
if (trans) {
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root, nr);
+ btrfs_btree_balance_dirty(rc->extent_root);
}
if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_key key;
- unsigned long nr;
u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
int err = 0;
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
err = btrfs_orphan_add(trans, inode);
out:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (err) {
if (inode)
iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
(unsigned long long)rc->block_group->key.objectid,
(unsigned long long)rc->block_group->flags);
- btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+ ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
btrfs_wait_ordered_extents(fs_info->tree_root, 0);
while (1) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb923d087da7..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root_item *item = &root->root_item;
struct timespec ct = CURRENT_TIME;
- spin_lock(&root->root_times_lock);
+ spin_lock(&root->root_item_lock);
item->ctransid = cpu_to_le64(trans->transid);
item->ctime.sec = cpu_to_le64(ct.tv_sec);
item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
- spin_unlock(&root->root_times_lock);
+ spin_unlock(&root->root_item_lock);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..bdbb94f245c9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 STRATO. All rights reserved.
+ * Copyright (C) 2011, 2012 STRATO. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
+#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
@@ -42,10 +43,23 @@
*/
struct scrub_block;
-struct scrub_dev;
+struct scrub_ctx;
-#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
-#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
struct scrub_page {
@@ -56,6 +70,8 @@ struct scrub_page {
u64 generation;
u64 logical;
u64 physical;
+ u64 physical_for_dev_replace;
+ atomic_t ref_count;
struct {
unsigned int mirror_num:8;
unsigned int have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
struct scrub_bio {
int index;
- struct scrub_dev *sdev;
+ struct scrub_ctx *sctx;
+ struct btrfs_device *dev;
struct bio *bio;
int err;
u64 logical;
u64 physical;
- struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+ struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+ struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
int page_count;
int next_free;
struct btrfs_work work;
};
struct scrub_block {
- struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+ struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
atomic_t ref_count; /* free mem on transition to zero */
- struct scrub_dev *sdev;
+ struct scrub_ctx *sctx;
struct {
unsigned int header_error:1;
unsigned int checksum_error:1;
@@ -91,23 +112,35 @@ struct scrub_block {
};
};
-struct scrub_dev {
- struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
- struct btrfs_device *dev;
+struct scrub_wr_ctx {
+ struct scrub_bio *wr_curr_bio;
+ struct btrfs_device *tgtdev;
+ int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+ atomic_t flush_all_writes;
+ struct mutex wr_lock;
+};
+
+struct scrub_ctx {
+ struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
+ struct btrfs_root *dev_root;
int first_free;
int curr;
- atomic_t in_flight;
- atomic_t fixup_cnt;
+ atomic_t bios_in_flight;
+ atomic_t workers_pending;
spinlock_t list_lock;
wait_queue_head_t list_wait;
u16 csum_size;
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
- int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+ int pages_per_rd_bio;
u32 sectorsize;
u32 nodesize;
u32 leafsize;
+
+ int is_dev_replace;
+ struct scrub_wr_ctx wr_ctx;
+
/*
* statistics
*/
@@ -116,13 +149,23 @@ struct scrub_dev {
};
struct scrub_fixup_nodatasum {
- struct scrub_dev *sdev;
+ struct scrub_ctx *sctx;
+ struct btrfs_device *dev;
u64 logical;
struct btrfs_root *root;
struct btrfs_work work;
int mirror_num;
};
+struct scrub_copy_nocow_ctx {
+ struct scrub_ctx *sctx;
+ u64 logical;
+ u64 len;
+ int mirror_num;
+ u64 physical_for_dev_replace;
+ struct btrfs_work work;
+};
+
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
};
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
- struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+ struct btrfs_fs_info *fs_info,
+ struct scrub_block *original_sblock,
u64 length, u64 logical,
- struct scrub_block *sblock);
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size);
+ struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+ int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
- struct scrub_page *spage);
-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
- u64 physical, u64 flags, u64 gen, int mirror_num,
- u8 *csum, int force);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum, int force,
+ u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u64 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+ struct scrub_wr_ctx *wr_ctx,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev,
+ int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+ u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+ void *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+ atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+ atomic_dec(&sctx->bios_in_flight);
+ wake_up(&sctx->list_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+ /*
+ * increment scrubs_running to prevent cancel requests from
+ * completing as long as a worker is running. we must also
+ * increment scrubs_paused to prevent deadlocking on pause
+ * requests used for transactions commits (as the worker uses a
+ * transaction context). it is safe to regard the worker
+ * as paused for all matters practical. effectively, we only
+ * avoid cancellation requests from completing.
+ */
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_inc(&fs_info->scrubs_running);
+ atomic_inc(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ atomic_inc(&sctx->workers_pending);
+}
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
-static void scrub_free_csums(struct scrub_dev *sdev)
+ /*
+ * see scrub_pending_trans_workers_inc() why we're pretending
+ * to be paused in the scrub counters
+ */
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_dec(&fs_info->scrubs_running);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ atomic_dec(&sctx->workers_pending);
+ wake_up(&fs_info->scrub_pause_wait);
+ wake_up(&sctx->list_wait);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
{
- while (!list_empty(&sdev->csum_list)) {
+ while (!list_empty(&sctx->csum_list)) {
struct btrfs_ordered_sum *sum;
- sum = list_first_entry(&sdev->csum_list,
+ sum = list_first_entry(&sctx->csum_list,
struct btrfs_ordered_sum, list);
list_del(&sum->list);
kfree(sum);
}
}
-static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
int i;
- if (!sdev)
+ if (!sctx)
return;
+ scrub_free_wr_ctx(&sctx->wr_ctx);
+
/* this can happen when scrub is cancelled */
- if (sdev->curr != -1) {
- struct scrub_bio *sbio = sdev->bios[sdev->curr];
+ if (sctx->curr != -1) {
+ struct scrub_bio *sbio = sctx->bios[sctx->curr];
for (i = 0; i < sbio->page_count; i++) {
- BUG_ON(!sbio->pagev[i]);
- BUG_ON(!sbio->pagev[i]->page);
+ WARN_ON(!sbio->pagev[i]->page);
scrub_block_put(sbio->pagev[i]->sblock);
}
bio_put(sbio->bio);
}
- for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
- struct scrub_bio *sbio = sdev->bios[i];
+ for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+ struct scrub_bio *sbio = sctx->bios[i];
if (!sbio)
break;
kfree(sbio);
}
- scrub_free_csums(sdev);
- kfree(sdev);
+ scrub_free_csums(sctx);
+ kfree(sctx);
}
static noinline_for_stack
-struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
- struct scrub_dev *sdev;
+ struct scrub_ctx *sctx;
int i;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
- int pages_per_bio;
+ int pages_per_rd_bio;
+ int ret;
- pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
- bio_get_nr_vecs(dev->bdev));
- sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
- if (!sdev)
+ /*
+ * the setting of pages_per_rd_bio is correct for scrub but might
+ * be wrong for the dev_replace code where we might read from
+ * different devices in the initial huge bios. However, that
+ * code is able to correctly handle the case when adding a page
+ * to a bio fails.
+ */
+ if (dev->bdev)
+ pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+ bio_get_nr_vecs(dev->bdev));
+ else
+ pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+ sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+ if (!sctx)
goto nomem;
- sdev->dev = dev;
- sdev->pages_per_bio = pages_per_bio;
- sdev->curr = -1;
- for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+ sctx->is_dev_replace = is_dev_replace;
+ sctx->pages_per_rd_bio = pages_per_rd_bio;
+ sctx->curr = -1;
+ sctx->dev_root = dev->dev_root;
+ for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
if (!sbio)
goto nomem;
- sdev->bios[i] = sbio;
+ sctx->bios[i] = sbio;
sbio->index = i;
- sbio->sdev = sdev;
+ sbio->sctx = sctx;
sbio->page_count = 0;
sbio->work.func = scrub_bio_end_io_worker;
- if (i != SCRUB_BIOS_PER_DEV-1)
- sdev->bios[i]->next_free = i + 1;
+ if (i != SCRUB_BIOS_PER_SCTX - 1)
+ sctx->bios[i]->next_free = i + 1;
else
- sdev->bios[i]->next_free = -1;
- }
- sdev->first_free = 0;
- sdev->nodesize = dev->dev_root->nodesize;
- sdev->leafsize = dev->dev_root->leafsize;
- sdev->sectorsize = dev->dev_root->sectorsize;
- atomic_set(&sdev->in_flight, 0);
- atomic_set(&sdev->fixup_cnt, 0);
- atomic_set(&sdev->cancel_req, 0);
- sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
- INIT_LIST_HEAD(&sdev->csum_list);
-
- spin_lock_init(&sdev->list_lock);
- spin_lock_init(&sdev->stat_lock);
- init_waitqueue_head(&sdev->list_wait);
- return sdev;
+ sctx->bios[i]->next_free = -1;
+ }
+ sctx->first_free = 0;
+ sctx->nodesize = dev->dev_root->nodesize;
+ sctx->leafsize = dev->dev_root->leafsize;
+ sctx->sectorsize = dev->dev_root->sectorsize;
+ atomic_set(&sctx->bios_in_flight, 0);
+ atomic_set(&sctx->workers_pending, 0);
+ atomic_set(&sctx->cancel_req, 0);
+ sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ INIT_LIST_HEAD(&sctx->csum_list);
+
+ spin_lock_init(&sctx->list_lock);
+ spin_lock_init(&sctx->stat_lock);
+ init_waitqueue_head(&sctx->list_wait);
+
+ ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+ fs_info->dev_replace.tgtdev, is_dev_replace);
+ if (ret) {
+ scrub_free_ctx(sctx);
+ return ERR_PTR(ret);
+ }
+ return sctx;
nomem:
- scrub_free_dev(sdev);
+ scrub_free_ctx(sctx);
return ERR_PTR(-ENOMEM);
}
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+ void *warn_ctx)
{
u64 isize;
u32 nlink;
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
int i;
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
- struct scrub_warning *swarn = ctx;
+ struct scrub_warning *swarn = warn_ctx;
struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
- struct btrfs_device *dev = sblock->sdev->dev;
- struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+ struct btrfs_device *dev;
+ struct btrfs_fs_info *fs_info;
struct btrfs_path *path;
struct btrfs_key found_key;
struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
const int bufsize = 4096;
int ret;
+ WARN_ON(sblock->page_count < 1);
+ dev = sblock->pagev[0]->dev;
+ fs_info = sblock->sctx->dev_root->fs_info;
+
path = btrfs_alloc_path();
swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
- BUG_ON(sblock->page_count < 1);
- swarn.sector = (sblock->pagev[0].physical) >> 9;
- swarn.logical = sblock->pagev[0].logical;
+ swarn.sector = (sblock->pagev[0]->physical) >> 9;
+ swarn.logical = sblock->pagev[0]->logical;
swarn.errstr = errstr;
- swarn.dev = dev;
+ swarn.dev = NULL;
swarn.msg_bufsize = bufsize;
swarn.scratch_bufsize = bufsize;
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
} while (ret != 1);
} else {
swarn.path = path;
+ swarn.dev = dev;
iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1,
scrub_print_warning_inode, &swarn);
@@ -416,11 +571,11 @@ out:
kfree(swarn.msg_buf);
}
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
struct page *page = NULL;
unsigned long index;
- struct scrub_fixup_nodatasum *fixup = ctx;
+ struct scrub_fixup_nodatasum *fixup = fixup_ctx;
int ret;
int corrected = 0;
struct btrfs_key key;
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
}
if (PageUptodate(page)) {
- struct btrfs_mapping_tree *map_tree;
+ struct btrfs_fs_info *fs_info;
if (PageDirty(page)) {
/*
* we need to write the data to the defect sector. the
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
ret = -EIO;
goto out;
}
- map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
- ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
fixup->logical, page,
fixup->mirror_num);
unlock_page(page);
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
int ret;
struct scrub_fixup_nodatasum *fixup;
- struct scrub_dev *sdev;
+ struct scrub_ctx *sctx;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_fs_info *fs_info;
struct btrfs_path *path;
int uncorrectable = 0;
fixup = container_of(work, struct scrub_fixup_nodatasum, work);
- sdev = fixup->sdev;
+ sctx = fixup->sctx;
fs_info = fixup->root->fs_info;
path = btrfs_alloc_path();
if (!path) {
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.malloc_errors;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.malloc_errors;
+ spin_unlock(&sctx->stat_lock);
uncorrectable = 1;
goto out;
}
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
}
WARN_ON(ret != 1);
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.corrected_errors;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.corrected_errors;
+ spin_unlock(&sctx->stat_lock);
out:
if (trans && !IS_ERR(trans))
btrfs_end_transaction(trans, fixup->root);
if (uncorrectable) {
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.uncorrectable_errors;
- spin_unlock(&sdev->stat_lock);
-
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.uncorrectable_errors;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->fs_info->dev_replace.
+ num_uncorrectable_read_errors);
printk_ratelimited_in_rcu(KERN_ERR
"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
(unsigned long long)fixup->logical,
- rcu_str_deref(sdev->dev->name));
+ rcu_str_deref(fixup->dev->name));
}
btrfs_free_path(path);
kfree(fixup);
- /* see caller why we're pretending to be paused in the scrub counters */
- mutex_lock(&fs_info->scrub_lock);
- atomic_dec(&fs_info->scrubs_running);
- atomic_dec(&fs_info->scrubs_paused);
- mutex_unlock(&fs_info->scrub_lock);
- atomic_dec(&sdev->fixup_cnt);
- wake_up(&fs_info->scrub_pause_wait);
- wake_up(&sdev->list_wait);
+ scrub_pending_trans_workers_dec(sctx);
}
/*
@@ -614,7 +764,8 @@ out:
*/
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
- struct scrub_dev *sdev = sblock_to_check->sdev;
+ struct scrub_ctx *sctx = sblock_to_check->sctx;
+ struct btrfs_device *dev;
struct btrfs_fs_info *fs_info;
u64 length;
u64 logical;
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
- fs_info = sdev->dev->dev_root->fs_info;
+ fs_info = sctx->dev_root->fs_info;
+ if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ /*
+ * if we find an error in a super block, we just report it.
+ * They will get written with the next transaction commit
+ * anyway
+ */
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+ return 0;
+ }
length = sblock_to_check->page_count * PAGE_SIZE;
- logical = sblock_to_check->pagev[0].logical;
- generation = sblock_to_check->pagev[0].generation;
- BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
- failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
- is_metadata = !(sblock_to_check->pagev[0].flags &
+ logical = sblock_to_check->pagev[0]->logical;
+ generation = sblock_to_check->pagev[0]->generation;
+ BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+ failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+ is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock_to_check->pagev[0].have_csum;
- csum = sblock_to_check->pagev[0].csum;
+ have_csum = sblock_to_check->pagev[0]->have_csum;
+ csum = sblock_to_check->pagev[0]->csum;
+ dev = sblock_to_check->pagev[0]->dev;
+
+ if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+ sblocks_for_recheck = NULL;
+ goto nodatasum_case;
+ }
/*
* read all mirrors one after the other. This includes to
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sizeof(*sblocks_for_recheck),
GFP_NOFS);
if (!sblocks_for_recheck) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.malloc_errors++;
- sdev->stat.read_errors++;
- sdev->stat.uncorrectable_errors++;
- spin_unlock(&sdev->stat_lock);
- btrfs_dev_stat_inc_and_print(sdev->dev,
- BTRFS_DEV_STAT_READ_ERRS);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
/* setup the context, map the logical blocks and alloc the pages */
- ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+ ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
logical, sblocks_for_recheck);
if (ret) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.read_errors++;
- sdev->stat.uncorrectable_errors++;
- spin_unlock(&sdev->stat_lock);
- btrfs_dev_stat_inc_and_print(sdev->dev,
- BTRFS_DEV_STAT_READ_ERRS);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
sblock_bad = sblocks_for_recheck + failed_mirror_index;
/* build and submit the bios for the failed mirror, check checksums */
- ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
- csum, generation, sdev->csum_size);
- if (ret) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.read_errors++;
- sdev->stat.uncorrectable_errors++;
- spin_unlock(&sdev->stat_lock);
- btrfs_dev_stat_inc_and_print(sdev->dev,
- BTRFS_DEV_STAT_READ_ERRS);
- goto out;
- }
+ scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+ csum, generation, sctx->csum_size);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* different bio (usually one of the two latter cases is
* the cause)
*/
- spin_lock(&sdev->stat_lock);
- sdev->stat.unverified_errors++;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.unverified_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (sctx->is_dev_replace)
+ scrub_write_block_to_dev_replace(sblock_bad);
goto out;
}
if (!sblock_bad->no_io_error_seen) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.read_errors++;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ spin_unlock(&sctx->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("i/o error", sblock_to_check);
- btrfs_dev_stat_inc_and_print(sdev->dev,
- BTRFS_DEV_STAT_READ_ERRS);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.csum_errors++;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.csum_errors++;
+ spin_unlock(&sctx->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("checksum error", sblock_to_check);
- btrfs_dev_stat_inc_and_print(sdev->dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
} else if (sblock_bad->header_error) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.verify_errors++;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.verify_errors++;
+ spin_unlock(&sctx->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
- btrfs_dev_stat_inc_and_print(sdev->dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
else
- btrfs_dev_stat_inc_and_print(sdev->dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
}
- if (sdev->readonly)
+ if (sctx->readonly && !sctx->is_dev_replace)
goto did_not_correct_error;
if (!is_metadata && !have_csum) {
struct scrub_fixup_nodatasum *fixup_nodatasum;
+nodatasum_case:
+ WARN_ON(sctx->is_dev_replace);
+
/*
* !is_metadata and !have_csum, this means that the data
* might not be COW'ed, that it might be modified
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
if (!fixup_nodatasum)
goto did_not_correct_error;
- fixup_nodatasum->sdev = sdev;
+ fixup_nodatasum->sctx = sctx;
+ fixup_nodatasum->dev = dev;
fixup_nodatasum->logical = logical;
fixup_nodatasum->root = fs_info->extent_root;
fixup_nodatasum->mirror_num = failed_mirror_index + 1;
- /*
- * increment scrubs_running to prevent cancel requests from
- * completing as long as a fixup worker is running. we must also
- * increment scrubs_paused to prevent deadlocking on pause
- * requests used for transactions commits (as the worker uses a
- * transaction context). it is safe to regard the fixup worker
- * as paused for all matters practical. effectively, we only
- * avoid cancellation requests from completing.
- */
- mutex_lock(&fs_info->scrub_lock);
- atomic_inc(&fs_info->scrubs_running);
- atomic_inc(&fs_info->scrubs_paused);
- mutex_unlock(&fs_info->scrub_lock);
- atomic_inc(&sdev->fixup_cnt);
+ scrub_pending_trans_workers_inc(sctx);
fixup_nodatasum->work.func = scrub_fixup_nodatasum;
btrfs_queue_worker(&fs_info->scrub_workers,
&fixup_nodatasum->work);
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/*
* now build and submit the bios for the other mirrors, check
- * checksums
- */
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
- if (mirror_index == failed_mirror_index)
- continue;
-
- /* build and submit the bios, check checksums */
- ret = scrub_recheck_block(fs_info,
- sblocks_for_recheck + mirror_index,
- is_metadata, have_csum, csum,
- generation, sdev->csum_size);
- if (ret)
- goto did_not_correct_error;
- }
-
- /*
- * first try to pick the mirror which is completely without I/O
+ * checksums.
+ * First try to pick the mirror which is completely without I/O
* errors and also does not have a checksum error.
* If one is found, and if a checksum is present, the full block
* that is known to contain an error is rewritten. Afterwards
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
mirror_index < BTRFS_MAX_MIRRORS &&
sblocks_for_recheck[mirror_index].page_count > 0;
mirror_index++) {
- struct scrub_block *sblock_other = sblocks_for_recheck +
- mirror_index;
+ struct scrub_block *sblock_other;
+
+ if (mirror_index == failed_mirror_index)
+ continue;
+ sblock_other = sblocks_for_recheck + mirror_index;
+
+ /* build and submit the bios, check checksums */
+ scrub_recheck_block(fs_info, sblock_other, is_metadata,
+ have_csum, csum, generation,
+ sctx->csum_size);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
sblock_other->no_io_error_seen) {
- int force_write = is_metadata || have_csum;
-
- ret = scrub_repair_block_from_good_copy(sblock_bad,
- sblock_other,
- force_write);
+ if (sctx->is_dev_replace) {
+ scrub_write_block_to_dev_replace(sblock_other);
+ } else {
+ int force_write = is_metadata || have_csum;
+
+ ret = scrub_repair_block_from_good_copy(
+ sblock_bad, sblock_other,
+ force_write);
+ }
if (0 == ret)
goto corrected_error;
}
}
/*
- * in case of I/O errors in the area that is supposed to be
+ * for dev_replace, pick good pages and write to the target device.
+ */
+ if (sctx->is_dev_replace) {
+ success = 1;
+ for (page_num = 0; page_num < sblock_bad->page_count;
+ page_num++) {
+ int sub_success;
+
+ sub_success = 0;
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other =
+ sblocks_for_recheck + mirror_index;
+ struct scrub_page *page_other =
+ sblock_other->pagev[page_num];
+
+ if (!page_other->io_error) {
+ ret = scrub_write_page_to_dev_replace(
+ sblock_other, page_num);
+ if (ret == 0) {
+ /* succeeded for this page */
+ sub_success = 1;
+ break;
+ } else {
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->
+ fs_info->dev_replace.
+ num_write_errors);
+ }
+ }
+ }
+
+ if (!sub_success) {
+ /*
+ * did not find a mirror to fetch the page
+ * from. scrub_write_page_to_dev_replace()
+ * handles this case (page->io_error), by
+ * filling the block with zeros before
+ * submitting the write request
+ */
+ success = 0;
+ ret = scrub_write_page_to_dev_replace(
+ sblock_bad, page_num);
+ if (ret)
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->fs_info->
+ dev_replace.num_write_errors);
+ }
+ }
+
+ goto out;
+ }
+
+ /*
+ * for regular scrub, repair those pages that are errored.
+ * In case of I/O errors in the area that is supposed to be
* repaired, continue by picking good copies of those pages.
* Select the good pages from mirrors to rewrite bad pages from
* the area to fix. Afterwards verify the checksum of the block
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
success = 1;
for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
- struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+ struct scrub_page *page_bad = sblock_bad->pagev[page_num];
if (!page_bad->io_error)
continue;
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
mirror_index++) {
struct scrub_block *sblock_other = sblocks_for_recheck +
mirror_index;
- struct scrub_page *page_other = sblock_other->pagev +
- page_num;
+ struct scrub_page *page_other = sblock_other->pagev[
+ page_num];
if (!page_other->io_error) {
ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* is verified, but most likely the data comes out
* of the page cache.
*/
- ret = scrub_recheck_block(fs_info, sblock_bad,
- is_metadata, have_csum, csum,
- generation, sdev->csum_size);
- if (!ret && !sblock_bad->header_error &&
+ scrub_recheck_block(fs_info, sblock_bad,
+ is_metadata, have_csum, csum,
+ generation, sctx->csum_size);
+ if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
goto corrected_error;
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
goto did_not_correct_error;
} else {
corrected_error:
- spin_lock(&sdev->stat_lock);
- sdev->stat.corrected_errors++;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.corrected_errors++;
+ spin_unlock(&sctx->stat_lock);
printk_ratelimited_in_rcu(KERN_ERR
"btrfs: fixed up error at logical %llu on dev %s\n",
(unsigned long long)logical,
- rcu_str_deref(sdev->dev->name));
+ rcu_str_deref(dev->name));
}
} else {
did_not_correct_error:
- spin_lock(&sdev->stat_lock);
- sdev->stat.uncorrectable_errors++;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
printk_ratelimited_in_rcu(KERN_ERR
"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
(unsigned long long)logical,
- rcu_str_deref(sdev->dev->name));
+ rcu_str_deref(dev->name));
}
out:
@@ -966,11 +1166,11 @@ out:
mirror_index;
int page_index;
- for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
- page_index++)
- if (sblock->pagev[page_index].page)
- __free_page(
- sblock->pagev[page_index].page);
+ for (page_index = 0; page_index < sblock->page_count;
+ page_index++) {
+ sblock->pagev[page_index]->sblock = NULL;
+ scrub_page_put(sblock->pagev[page_index]);
+ }
}
kfree(sblocks_for_recheck);
}
@@ -978,8 +1178,9 @@ out:
return 0;
}
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
- struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+ struct btrfs_fs_info *fs_info,
+ struct scrub_block *original_sblock,
u64 length, u64 logical,
struct scrub_block *sblocks_for_recheck)
{
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
int ret;
/*
- * note: the three members sdev, ref_count and outstanding_pages
+ * note: the two members ref_count and outstanding_pages
* are not used (and not set) in the blocks that are used for
* the recheck procedure
*/
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
- ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
- &bbio, 0);
+ ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
+ &mapped_length, &bbio, 0);
if (ret || !bbio || mapped_length < sublen) {
kfree(bbio);
return -EIO;
}
- BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+ BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
mirror_index++) {
struct scrub_block *sblock;
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
continue;
sblock = sblocks_for_recheck + mirror_index;
- page = sblock->pagev + page_index;
+ sblock->sctx = sctx;
+ page = kzalloc(sizeof(*page), GFP_NOFS);
+ if (!page) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ kfree(bbio);
+ return -ENOMEM;
+ }
+ scrub_page_get(page);
+ sblock->pagev[page_index] = page;
page->logical = logical;
page->physical = bbio->stripes[mirror_index].physical;
+ BUG_ON(page_index >= original_sblock->page_count);
+ page->physical_for_dev_replace =
+ original_sblock->pagev[page_index]->
+ physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */
page->dev = bbio->stripes[mirror_index].dev;
page->mirror_num = mirror_index + 1;
- page->page = alloc_page(GFP_NOFS);
- if (!page->page) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.malloc_errors++;
- spin_unlock(&sdev->stat_lock);
- kfree(bbio);
- return -ENOMEM;
- }
sblock->page_count++;
+ page->page = alloc_page(GFP_NOFS);
+ if (!page->page)
+ goto leave_nomem;
}
kfree(bbio);
length -= sublen;
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
* to take those pages that are not errored from all the mirrors so that
* the pages that are errored in the just handled mirror can be repaired.
*/
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size)
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size)
{
int page_num;
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
- int ret;
- struct scrub_page *page = sblock->pagev + page_num;
+ struct scrub_page *page = sblock->pagev[page_num];
DECLARE_COMPLETION_ONSTACK(complete);
if (page->dev->bdev == NULL) {
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
continue;
}
- BUG_ON(!page->page);
+ WARN_ON(!page->page);
bio = bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return -EIO;
+ if (!bio) {
+ page->io_error = 1;
+ sblock->no_io_error_seen = 0;
+ continue;
+ }
bio->bi_bdev = page->dev->bdev;
bio->bi_sector = page->physical >> 9;
bio->bi_end_io = scrub_complete_bio_end_io;
bio->bi_private = &complete;
- ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
- if (PAGE_SIZE != ret) {
- bio_put(bio);
- return -EIO;
- }
+ bio_add_page(bio, page->page, PAGE_SIZE, 0);
btrfsic_submit_bio(READ, bio);
/* this will also unplug the queue */
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
have_csum, csum, generation,
csum_size);
- return 0;
+ return;
}
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct btrfs_root *root = fs_info->extent_root;
void *mapped_buffer;
- BUG_ON(!sblock->pagev[0].page);
+ WARN_ON(!sblock->pagev[0]->page);
if (is_metadata) {
struct btrfs_header *h;
- mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+ mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
h = (struct btrfs_header *)mapped_buffer;
- if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+ if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE)) {
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
if (!have_csum)
return;
- mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+ mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
}
for (page_num = 0;;) {
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
page_num++;
if (page_num >= sblock->page_count)
break;
- BUG_ON(!sblock->pagev[page_num].page);
+ WARN_ON(!sblock->pagev[page_num]->page);
- mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
+ mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
}
btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write)
{
- struct scrub_page *page_bad = sblock_bad->pagev + page_num;
- struct scrub_page *page_good = sblock_good->pagev + page_num;
+ struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_page *page_good = sblock_good->pagev[page_num];
- BUG_ON(sblock_bad->pagev[page_num].page == NULL);
- BUG_ON(sblock_good->pagev[page_num].page == NULL);
+ BUG_ON(page_bad->page == NULL);
+ BUG_ON(page_good->page == NULL);
if (force_write || sblock_bad->header_error ||
sblock_bad->checksum_error || page_bad->io_error) {
struct bio *bio;
int ret;
DECLARE_COMPLETION_ONSTACK(complete);
+ if (!page_bad->dev->bdev) {
+ printk_ratelimited(KERN_WARNING
+ "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+ return -EIO;
+ }
+
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
if (!bio_flagged(bio, BIO_UPTODATE)) {
btrfs_dev_stat_inc_and_print(page_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
+ btrfs_dev_replace_stats_inc(
+ &sblock_bad->sctx->dev_root->fs_info->
+ dev_replace.num_write_errors);
bio_put(bio);
return -EIO;
}
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return 0;
}
-static void scrub_checksum(struct scrub_block *sblock)
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+ int page_num;
+
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ int ret;
+
+ ret = scrub_write_page_to_dev_replace(sblock, page_num);
+ if (ret)
+ btrfs_dev_replace_stats_inc(
+ &sblock->sctx->dev_root->fs_info->dev_replace.
+ num_write_errors);
+ }
+}
+
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+ int page_num)
+{
+ struct scrub_page *spage = sblock->pagev[page_num];
+
+ BUG_ON(spage->page == NULL);
+ if (spage->io_error) {
+ void *mapped_buffer = kmap_atomic(spage->page);
+
+ memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+ flush_dcache_page(spage->page);
+ kunmap_atomic(mapped_buffer);
+ }
+ return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage)
+{
+ struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+ struct scrub_bio *sbio;
+ int ret;
+
+ mutex_lock(&wr_ctx->wr_lock);
+again:
+ if (!wr_ctx->wr_curr_bio) {
+ wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+ GFP_NOFS);
+ if (!wr_ctx->wr_curr_bio) {
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -ENOMEM;
+ }
+ wr_ctx->wr_curr_bio->sctx = sctx;
+ wr_ctx->wr_curr_bio->page_count = 0;
+ }
+ sbio = wr_ctx->wr_curr_bio;
+ if (sbio->page_count == 0) {
+ struct bio *bio;
+
+ sbio->physical = spage->physical_for_dev_replace;
+ sbio->logical = spage->logical;
+ sbio->dev = wr_ctx->tgtdev;
+ bio = sbio->bio;
+ if (!bio) {
+ bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+ if (!bio) {
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -ENOMEM;
+ }
+ sbio->bio = bio;
+ }
+
+ bio->bi_private = sbio;
+ bio->bi_end_io = scrub_wr_bio_end_io;
+ bio->bi_bdev = sbio->dev->bdev;
+ bio->bi_sector = sbio->physical >> 9;
+ sbio->err = 0;
+ } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ spage->physical_for_dev_replace ||
+ sbio->logical + sbio->page_count * PAGE_SIZE !=
+ spage->logical) {
+ scrub_wr_submit(sctx);
+ goto again;
+ }
+
+ ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ if (sbio->page_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -EIO;
+ }
+ scrub_wr_submit(sctx);
+ goto again;
+ }
+
+ sbio->pagev[sbio->page_count] = spage;
+ scrub_page_get(spage);
+ sbio->page_count++;
+ if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+ scrub_wr_submit(sctx);
+ mutex_unlock(&wr_ctx->wr_lock);
+
+ return 0;
+}
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+ struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+ struct scrub_bio *sbio;
+
+ if (!wr_ctx->wr_curr_bio)
+ return;
+
+ sbio = wr_ctx->wr_curr_bio;
+ wr_ctx->wr_curr_bio = NULL;
+ WARN_ON(!sbio->bio->bi_bdev);
+ scrub_pending_bio_inc(sctx);
+ /* process all writes in a single worker thread. Then the block layer
+ * orders the requests before sending them to the driver which
+ * doubled the write performance on spinning disks when measured
+ * with Linux 3.5 */
+ btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+ sbio->err = err;
+ sbio->bio = bio;
+
+ sbio->work.func = scrub_wr_bio_end_io_worker;
+ btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_ctx *sctx = sbio->sctx;
+ int i;
+
+ WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+ if (sbio->err) {
+ struct btrfs_dev_replace *dev_replace =
+ &sbio->sctx->dev_root->fs_info->dev_replace;
+
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+
+ spage->io_error = 1;
+ btrfs_dev_replace_stats_inc(&dev_replace->
+ num_write_errors);
+ }
+ }
+
+ for (i = 0; i < sbio->page_count; i++)
+ scrub_page_put(sbio->pagev[i]);
+
+ bio_put(sbio->bio);
+ kfree(sbio);
+ scrub_pending_bio_dec(sctx);
+}
+
+static int scrub_checksum(struct scrub_block *sblock)
{
u64 flags;
int ret;
- BUG_ON(sblock->page_count < 1);
- flags = sblock->pagev[0].flags;
+ WARN_ON(sblock->page_count < 1);
+ flags = sblock->pagev[0]->flags;
ret = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA)
ret = scrub_checksum_data(sblock);
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
WARN_ON(1);
if (ret)
scrub_handle_errored_block(sblock);
+
+ return ret;
}
static int scrub_checksum_data(struct scrub_block *sblock)
{
- struct scrub_dev *sdev = sblock->sdev;
+ struct scrub_ctx *sctx = sblock->sctx;
u8 csum[BTRFS_CSUM_SIZE];
u8 *on_disk_csum;
struct page *page;
void *buffer;
u32 crc = ~(u32)0;
int fail = 0;
- struct btrfs_root *root = sdev->dev->dev_root;
+ struct btrfs_root *root = sctx->dev_root;
u64 len;
int index;
BUG_ON(sblock->page_count < 1);
- if (!sblock->pagev[0].have_csum)
+ if (!sblock->pagev[0]->have_csum)
return 0;
- on_disk_csum = sblock->pagev[0].csum;
- page = sblock->pagev[0].page;
+ on_disk_csum = sblock->pagev[0]->csum;
+ page = sblock->pagev[0]->page;
buffer = kmap_atomic(page);
- len = sdev->sectorsize;
+ len = sctx->sectorsize;
index = 0;
for (;;) {
u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
break;
index++;
BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index].page);
- page = sblock->pagev[index].page;
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
buffer = kmap_atomic(page);
}
btrfs_csum_final(crc, csum);
- if (memcmp(csum, on_disk_csum, sdev->csum_size))
+ if (memcmp(csum, on_disk_csum, sctx->csum_size))
fail = 1;
return fail;
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
- struct scrub_dev *sdev = sblock->sdev;
+ struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_header *h;
- struct btrfs_root *root = sdev->dev->dev_root;
+ struct btrfs_root *root = sctx->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
int index;
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0].page;
+ page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
h = (struct btrfs_header *)mapped_buffer;
- memcpy(on_disk_csum, h->csum, sdev->csum_size);
+ memcpy(on_disk_csum, h->csum, sctx->csum_size);
/*
* we don't use the getter functions here, as we
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* b) the page is already kmapped
*/
- if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
+ if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
++fail;
- if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
+ if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
++fail;
if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
BTRFS_UUID_SIZE))
++fail;
- BUG_ON(sdev->nodesize != sdev->leafsize);
- len = sdev->nodesize - BTRFS_CSUM_SIZE;
+ WARN_ON(sctx->nodesize != sctx->leafsize);
+ len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
index = 0;
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
break;
index++;
BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index].page);
- page = sblock->pagev[index].page;
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
mapped_buffer = kmap_atomic(page);
mapped_size = PAGE_SIZE;
p = mapped_buffer;
}
btrfs_csum_final(crc, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
+ if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
++crc_fail;
return fail || crc_fail;
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
- struct scrub_dev *sdev = sblock->sdev;
- struct btrfs_root *root = sdev->dev->dev_root;
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_root *root = sctx->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
int index;
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0].page;
+ page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
s = (struct btrfs_super_block *)mapped_buffer;
- memcpy(on_disk_csum, s->csum, sdev->csum_size);
+ memcpy(on_disk_csum, s->csum, sctx->csum_size);
- if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
+ if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
++fail_cor;
- if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
+ if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
++fail_gen;
if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
break;
index++;
BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index].page);
- page = sblock->pagev[index].page;
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
mapped_buffer = kmap_atomic(page);
mapped_size = PAGE_SIZE;
p = mapped_buffer;
}
btrfs_csum_final(crc, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
+ if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
++fail_cor;
if (fail_cor + fail_gen) {
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
* They will get written with the next transaction commit
* anyway
*/
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.super_errors;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
if (fail_cor)
- btrfs_dev_stat_inc_and_print(sdev->dev,
+ btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
- btrfs_dev_stat_inc_and_print(sdev->dev,
+ btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
int i;
for (i = 0; i < sblock->page_count; i++)
- if (sblock->pagev[i].page)
- __free_page(sblock->pagev[i].page);
+ scrub_page_put(sblock->pagev[i]);
kfree(sblock);
}
}
-static void scrub_submit(struct scrub_dev *sdev)
+static void scrub_page_get(struct scrub_page *spage)
+{
+ atomic_inc(&spage->ref_count);
+}
+
+static void scrub_page_put(struct scrub_page *spage)
+{
+ if (atomic_dec_and_test(&spage->ref_count)) {
+ if (spage->page)
+ __free_page(spage->page);
+ kfree(spage);
+ }
+}
+
+static void scrub_submit(struct scrub_ctx *sctx)
{
struct scrub_bio *sbio;
- if (sdev->curr == -1)
+ if (sctx->curr == -1)
return;
- sbio = sdev->bios[sdev->curr];
- sdev->curr = -1;
- atomic_inc(&sdev->in_flight);
+ sbio = sctx->bios[sctx->curr];
+ sctx->curr = -1;
+ scrub_pending_bio_inc(sctx);
- btrfsic_submit_bio(READ, sbio->bio);
+ if (!sbio->bio->bi_bdev) {
+ /*
+ * this case should not happen. If btrfs_map_block() is
+ * wrong, it could happen for dev-replace operations on
+ * missing devices when no mirrors are available, but in
+ * this case it should already fail the mount.
+ * This case is handled correctly (but _very_ slowly).
+ */
+ printk_ratelimited(KERN_WARNING
+ "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
+ bio_endio(sbio->bio, -EIO);
+ } else {
+ btrfsic_submit_bio(READ, sbio->bio);
+ }
}
-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
- struct scrub_page *spage)
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage)
{
struct scrub_block *sblock = spage->sblock;
struct scrub_bio *sbio;
@@ -1494,28 +1901,29 @@ again:
/*
* grab a fresh bio or wait for one to become available
*/
- while (sdev->curr == -1) {
- spin_lock(&sdev->list_lock);
- sdev->curr = sdev->first_free;
- if (sdev->curr != -1) {
- sdev->first_free = sdev->bios[sdev->curr]->next_free;
- sdev->bios[sdev->curr]->next_free = -1;
- sdev->bios[sdev->curr]->page_count = 0;
- spin_unlock(&sdev->list_lock);
+ while (sctx->curr == -1) {
+ spin_lock(&sctx->list_lock);
+ sctx->curr = sctx->first_free;
+ if (sctx->curr != -1) {
+ sctx->first_free = sctx->bios[sctx->curr]->next_free;
+ sctx->bios[sctx->curr]->next_free = -1;
+ sctx->bios[sctx->curr]->page_count = 0;
+ spin_unlock(&sctx->list_lock);
} else {
- spin_unlock(&sdev->list_lock);
- wait_event(sdev->list_wait, sdev->first_free != -1);
+ spin_unlock(&sctx->list_lock);
+ wait_event(sctx->list_wait, sctx->first_free != -1);
}
}
- sbio = sdev->bios[sdev->curr];
+ sbio = sctx->bios[sctx->curr];
if (sbio->page_count == 0) {
struct bio *bio;
sbio->physical = spage->physical;
sbio->logical = spage->logical;
+ sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
+ bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
if (!bio)
return -ENOMEM;
sbio->bio = bio;
@@ -1523,14 +1931,15 @@ again:
bio->bi_private = sbio;
bio->bi_end_io = scrub_bio_end_io;
- bio->bi_bdev = sdev->dev->bdev;
- bio->bi_sector = spage->physical >> 9;
+ bio->bi_bdev = sbio->dev->bdev;
+ bio->bi_sector = sbio->physical >> 9;
sbio->err = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical ||
sbio->logical + sbio->page_count * PAGE_SIZE !=
- spage->logical) {
- scrub_submit(sdev);
+ spage->logical ||
+ sbio->dev != spage->dev) {
+ scrub_submit(sctx);
goto again;
}
@@ -1542,81 +1951,87 @@ again:
sbio->bio = NULL;
return -EIO;
}
- scrub_submit(sdev);
+ scrub_submit(sctx);
goto again;
}
- scrub_block_get(sblock); /* one for the added page */
+ scrub_block_get(sblock); /* one for the page added to the bio */
atomic_inc(&sblock->outstanding_pages);
sbio->page_count++;
- if (sbio->page_count == sdev->pages_per_bio)
- scrub_submit(sdev);
+ if (sbio->page_count == sctx->pages_per_rd_bio)
+ scrub_submit(sctx);
return 0;
}
-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
- u64 physical, u64 flags, u64 gen, int mirror_num,
- u8 *csum, int force)
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum, int force,
+ u64 physical_for_dev_replace)
{
struct scrub_block *sblock;
int index;
sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
if (!sblock) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.malloc_errors++;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page later on */
+ /* one ref inside this function, plus one for each page added to
+ * a bio later on */
atomic_set(&sblock->ref_count, 1);
- sblock->sdev = sdev;
+ sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
for (index = 0; len > 0; index++) {
- struct scrub_page *spage = sblock->pagev + index;
+ struct scrub_page *spage;
u64 l = min_t(u64, len, PAGE_SIZE);
- BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
- spage->page = alloc_page(GFP_NOFS);
- if (!spage->page) {
- spin_lock(&sdev->stat_lock);
- sdev->stat.malloc_errors++;
- spin_unlock(&sdev->stat_lock);
- while (index > 0) {
- index--;
- __free_page(sblock->pagev[index].page);
- }
- kfree(sblock);
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_block_put(sblock);
return -ENOMEM;
}
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ scrub_page_get(spage);
+ sblock->pagev[index] = spage;
spage->sblock = sblock;
- spage->dev = sdev->dev;
+ spage->dev = dev;
spage->flags = flags;
spage->generation = gen;
spage->logical = logical;
spage->physical = physical;
+ spage->physical_for_dev_replace = physical_for_dev_replace;
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
- memcpy(spage->csum, csum, sdev->csum_size);
+ memcpy(spage->csum, csum, sctx->csum_size);
} else {
spage->have_csum = 0;
}
sblock->page_count++;
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
+ goto leave_nomem;
len -= l;
logical += l;
physical += l;
+ physical_for_dev_replace += l;
}
- BUG_ON(sblock->page_count == 0);
+ WARN_ON(sblock->page_count == 0);
for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev + index;
+ struct scrub_page *spage = sblock->pagev[index];
int ret;
- ret = scrub_add_page_to_bio(sdev, spage);
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
if (ret) {
scrub_block_put(sblock);
return ret;
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
}
if (force)
- scrub_submit(sdev);
+ scrub_submit(sctx);
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
static void scrub_bio_end_io(struct bio *bio, int err)
{
struct scrub_bio *sbio = bio->bi_private;
- struct scrub_dev *sdev = sbio->sdev;
- struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
sbio->err = err;
sbio->bio = bio;
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
static void scrub_bio_end_io_worker(struct btrfs_work *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
- struct scrub_dev *sdev = sbio->sdev;
+ struct scrub_ctx *sctx = sbio->sctx;
int i;
- BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+ BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
if (sbio->err) {
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
bio_put(sbio->bio);
sbio->bio = NULL;
- spin_lock(&sdev->list_lock);
- sbio->next_free = sdev->first_free;
- sdev->first_free = sbio->index;
- spin_unlock(&sdev->list_lock);
- atomic_dec(&sdev->in_flight);
- wake_up(&sdev->list_wait);
+ spin_lock(&sctx->list_lock);
+ sbio->next_free = sctx->first_free;
+ sctx->first_free = sbio->index;
+ spin_unlock(&sctx->list_lock);
+
+ if (sctx->is_dev_replace &&
+ atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ }
+
+ scrub_pending_bio_dec(sctx);
}
static void scrub_block_complete(struct scrub_block *sblock)
{
- if (!sblock->no_io_error_seen)
+ if (!sblock->no_io_error_seen) {
scrub_handle_errored_block(sblock);
- else
- scrub_checksum(sblock);
+ } else {
+ /*
+ * if has checksum error, write via repair mechanism in
+ * dev replace case, otherwise write here in dev replace
+ * case.
+ */
+ if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+ scrub_write_block_to_dev_replace(sblock);
+ }
}
-static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
unsigned long i;
unsigned long num_sectors;
- while (!list_empty(&sdev->csum_list)) {
- sum = list_first_entry(&sdev->csum_list,
+ while (!list_empty(&sctx->csum_list)) {
+ sum = list_first_entry(&sctx->csum_list,
struct btrfs_ordered_sum, list);
if (sum->bytenr > logical)
return 0;
if (sum->bytenr + sum->len > logical)
break;
- ++sdev->stat.csum_discards;
+ ++sctx->stat.csum_discards;
list_del(&sum->list);
kfree(sum);
sum = NULL;
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
if (!sum)
return 0;
- num_sectors = sum->len / sdev->sectorsize;
+ num_sectors = sum->len / sctx->sectorsize;
for (i = 0; i < num_sectors; ++i) {
if (sum->sums[i].bytenr == logical) {
- memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
+ memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
ret = 1;
break;
}
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
}
/* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
- u64 physical, u64 flags, u64 gen, int mirror_num)
+static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u64 physical_for_dev_replace)
{
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sdev->sectorsize;
- spin_lock(&sdev->stat_lock);
- sdev->stat.data_extents_scrubbed++;
- sdev->stat.data_bytes_scrubbed += len;
- spin_unlock(&sdev->stat_lock);
+ blocksize = sctx->sectorsize;
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.data_extents_scrubbed++;
+ sctx->stat.data_bytes_scrubbed += len;
+ spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- BUG_ON(sdev->nodesize != sdev->leafsize);
- blocksize = sdev->nodesize;
- spin_lock(&sdev->stat_lock);
- sdev->stat.tree_extents_scrubbed++;
- sdev->stat.tree_bytes_scrubbed += len;
- spin_unlock(&sdev->stat_lock);
+ WARN_ON(sctx->nodesize != sctx->leafsize);
+ blocksize = sctx->nodesize;
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.tree_extents_scrubbed++;
+ sctx->stat.tree_bytes_scrubbed += len;
+ spin_unlock(&sctx->stat_lock);
} else {
- blocksize = sdev->sectorsize;
- BUG_ON(1);
+ blocksize = sctx->sectorsize;
+ WARN_ON(1);
}
while (len) {
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sdev, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, l, csum);
if (have_csum == 0)
- ++sdev->stat.no_csum;
+ ++sctx->stat.no_csum;
+ if (sctx->is_dev_replace && !have_csum) {
+ ret = copy_nocow_pages(sctx, logical, l,
+ mirror_num,
+ physical_for_dev_replace);
+ goto behind_scrub_pages;
+ }
}
- ret = scrub_pages(sdev, logical, l, physical, flags, gen,
- mirror_num, have_csum ? csum : NULL, 0);
+ ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
+ mirror_num, have_csum ? csum : NULL, 0,
+ physical_for_dev_replace);
+behind_scrub_pages:
if (ret)
return ret;
len -= l;
logical += l;
physical += l;
+ physical_for_dev_replace += l;
}
return 0;
}
-static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
- struct map_lookup *map, int num, u64 base, u64 length)
+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+ struct map_lookup *map,
+ struct btrfs_device *scrub_dev,
+ int num, u64 base, u64 length,
+ int is_dev_replace)
{
struct btrfs_path *path;
- struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
struct reada_control *reada2;
struct btrfs_key key_start;
struct btrfs_key key_end;
-
u64 increment = map->stripe_len;
u64 offset;
+ u64 extent_logical;
+ u64 extent_physical;
+ u64 extent_len;
+ struct btrfs_device *extent_dev;
+ int extent_mirror_num;
nstripes = length;
offset = 0;
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
*/
logical = base + offset;
- wait_event(sdev->list_wait,
- atomic_read(&sdev->in_flight) == 0);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
* canceled?
*/
if (atomic_read(&fs_info->scrub_cancel_req) ||
- atomic_read(&sdev->cancel_req)) {
+ atomic_read(&sctx->cancel_req)) {
ret = -ECANCELED;
goto out;
}
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
*/
if (atomic_read(&fs_info->scrub_pause_req)) {
/* push queued extents */
- scrub_submit(sdev);
- wait_event(sdev->list_wait,
- atomic_read(&sdev->in_flight) == 0);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
ret = btrfs_lookup_csums_range(csum_root, logical,
logical + map->stripe_len - 1,
- &sdev->csum_list, 1);
+ &sctx->csum_list, 1);
if (ret)
goto out;
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
key.objectid;
}
- ret = scrub_extent(sdev, key.objectid, key.offset,
- key.objectid - logical + physical,
- flags, generation, mirror_num);
+ extent_logical = key.objectid;
+ extent_physical = key.objectid - logical + physical;
+ extent_len = key.offset;
+ extent_dev = scrub_dev;
+ extent_mirror_num = mirror_num;
+ if (is_dev_replace)
+ scrub_remap_extent(fs_info, extent_logical,
+ extent_len, &extent_physical,
+ &extent_dev,
+ &extent_mirror_num);
+ ret = scrub_extent(sctx, extent_logical, extent_len,
+ extent_physical, extent_dev, flags,
+ generation, extent_mirror_num,
+ key.objectid - logical + physical);
if (ret)
goto out;
@@ -2016,29 +2477,34 @@ next:
btrfs_release_path(path);
logical += increment;
physical += map->stripe_len;
- spin_lock(&sdev->stat_lock);
- sdev->stat.last_physical = physical;
- spin_unlock(&sdev->stat_lock);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.last_physical = physical;
+ spin_unlock(&sctx->stat_lock);
}
+out:
/* push queued extents */
- scrub_submit(sdev);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
-out:
blk_finish_plug(&plug);
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
-static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
- u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
- u64 dev_offset)
+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 length,
+ u64 dev_offset, int is_dev_replace)
{
struct btrfs_mapping_tree *map_tree =
- &sdev->dev->dev_root->fs_info->mapping_tree;
+ &sctx->dev_root->fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
int i;
- int ret = -EINVAL;
+ int ret = 0;
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
goto out;
for (i = 0; i < map->num_stripes; ++i) {
- if (map->stripes[i].dev == sdev->dev &&
+ if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sdev, map, i, chunk_offset, length);
+ ret = scrub_stripe(sctx, map, scrub_dev, i,
+ chunk_offset, length,
+ is_dev_replace);
if (ret)
goto out;
}
@@ -2069,11 +2537,13 @@ out:
}
static noinline_for_stack
-int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev, u64 start, u64 end,
+ int is_dev_replace)
{
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = sdev->dev->dev_root;
+ struct btrfs_root *root = sctx->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 length;
u64 chunk_tree;
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_block_group_cache *cache;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
path = btrfs_alloc_path();
if (!path)
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
path->search_commit_root = 1;
path->skip_locking = 1;
- key.objectid = sdev->dev->devid;
+ key.objectid = scrub_dev->devid;
key.offset = 0ull;
key.type = BTRFS_DEV_EXTENT_KEY;
-
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
btrfs_item_key_to_cpu(l, &found_key, slot);
- if (found_key.objectid != sdev->dev->devid)
+ if (found_key.objectid != scrub_dev->devid)
break;
if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
ret = -ENOENT;
break;
}
- ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
- chunk_offset, length, found_key.offset);
+ dev_replace->cursor_right = found_key.offset + length;
+ dev_replace->cursor_left = found_key.offset;
+ dev_replace->item_needs_writeback = 1;
+ ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
+ chunk_offset, length, found_key.offset,
+ is_dev_replace);
+
+ /*
+ * flush, submit all pending read and write bios, afterwards
+ * wait for them.
+ * Note that in the dev replace case, a read request causes
+ * write requests that are submitted in the read completion
+ * worker. Therefore in the current situation, it is required
+ * that all write requests are flushed, so that all read and
+ * write requests are really completed when bios_in_flight
+ * changes to 0.
+ */
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->workers_pending) == 0);
+
+ mutex_lock(&fs_info->scrub_lock);
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ dev_replace->cursor_left = dev_replace->cursor_right;
+ dev_replace->item_needs_writeback = 1;
btrfs_put_block_group(cache);
if (ret)
break;
+ if (is_dev_replace &&
+ atomic64_read(&dev_replace->num_write_errors) > 0) {
+ ret = -EIO;
+ break;
+ }
+ if (sctx->stat.malloc_errors > 0) {
+ ret = -ENOMEM;
+ break;
+ }
key.offset = found_key.offset + length;
btrfs_release_path(path);
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
return ret < 0 ? ret : 0;
}
-static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
+static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev)
{
int i;
u64 bytenr;
u64 gen;
int ret;
- struct btrfs_device *device = sdev->dev;
- struct btrfs_root *root = device->dev_root;
+ struct btrfs_root *root = sctx->dev_root;
if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
return -EIO;
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
break;
- ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
- BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+ ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+ NULL, 1, bytenr);
if (ret)
return ret;
}
- wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
return 0;
}
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
-static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+ int is_dev_replace)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
mutex_lock(&fs_info->scrub_lock);
if (fs_info->scrub_workers_refcnt == 0) {
- btrfs_init_workers(&fs_info->scrub_workers, "scrub",
- fs_info->thread_pool_size, &fs_info->generic_worker);
+ if (is_dev_replace)
+ btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
+ &fs_info->generic_worker);
+ else
+ btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
fs_info->scrub_workers.idle_thresh = 4;
ret = btrfs_start_workers(&fs_info->scrub_workers);
if (ret)
goto out;
+ btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
+ "scrubwrc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+ fs_info->scrub_wr_completion_workers.idle_thresh = 2;
+ ret = btrfs_start_workers(
+ &fs_info->scrub_wr_completion_workers);
+ if (ret)
+ goto out;
+ btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
+ &fs_info->generic_worker);
+ ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
+ if (ret)
+ goto out;
}
++fs_info->scrub_workers_refcnt;
out:
@@ -2223,40 +2764,41 @@ out:
return ret;
}
-static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
+static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
-
mutex_lock(&fs_info->scrub_lock);
- if (--fs_info->scrub_workers_refcnt == 0)
+ if (--fs_info->scrub_workers_refcnt == 0) {
btrfs_stop_workers(&fs_info->scrub_workers);
+ btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
+ btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+ }
WARN_ON(fs_info->scrub_workers_refcnt < 0);
mutex_unlock(&fs_info->scrub_lock);
}
-
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
- struct btrfs_scrub_progress *progress, int readonly)
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace)
{
- struct scrub_dev *sdev;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
- if (btrfs_fs_closing(root->fs_info))
+ if (btrfs_fs_closing(fs_info))
return -EINVAL;
/*
* check some assumptions
*/
- if (root->nodesize != root->leafsize) {
+ if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
printk(KERN_ERR
"btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
- root->nodesize, root->leafsize);
+ fs_info->chunk_root->nodesize,
+ fs_info->chunk_root->leafsize);
return -EINVAL;
}
- if (root->nodesize > BTRFS_STRIPE_LEN) {
+ if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
/*
* in this case scrub is unable to calculate the checksum
* the way scrub is implemented. Do not handle this
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
*/
printk(KERN_ERR
"btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
- root->nodesize, BTRFS_STRIPE_LEN);
+ fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
return -EINVAL;
}
- if (root->sectorsize != PAGE_SIZE) {
+ if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
/* not supported for data w/o checksums */
printk(KERN_ERR
"btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
- root->sectorsize, (unsigned long long)PAGE_SIZE);
+ fs_info->chunk_root->sectorsize,
+ (unsigned long long)PAGE_SIZE);
return -EINVAL;
}
- ret = scrub_workers_get(root);
+ if (fs_info->chunk_root->nodesize >
+ PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
+ fs_info->chunk_root->sectorsize >
+ PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+ /*
+ * would exhaust the array bounds of pagev member in
+ * struct scrub_block
+ */
+ pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
+ fs_info->chunk_root->nodesize,
+ SCRUB_MAX_PAGES_PER_BLOCK,
+ fs_info->chunk_root->sectorsize,
+ SCRUB_MAX_PAGES_PER_BLOCK);
+ return -EINVAL;
+ }
+
+ ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret)
return ret;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(root, devid, NULL, NULL);
- if (!dev || dev->missing) {
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(root);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ if (!dev || (dev->missing && !is_dev_replace)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ scrub_workers_put(fs_info);
return -ENODEV;
}
mutex_lock(&fs_info->scrub_lock);
- if (!dev->in_fs_metadata) {
+ if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(root);
- return -ENODEV;
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ scrub_workers_put(fs_info);
+ return -EIO;
}
- if (dev->scrub_device) {
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (dev->scrub_device ||
+ (!is_dev_replace &&
+ btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(root);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ scrub_workers_put(fs_info);
return -EINPROGRESS;
}
- sdev = scrub_setup_dev(dev);
- if (IS_ERR(sdev)) {
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ sctx = scrub_setup_ctx(dev, is_dev_replace);
+ if (IS_ERR(sctx)) {
mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(root);
- return PTR_ERR(sdev);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ scrub_workers_put(fs_info);
+ return PTR_ERR(sctx);
}
- sdev->readonly = readonly;
- dev->scrub_device = sdev;
+ sctx->readonly = readonly;
+ dev->scrub_device = sctx;
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- down_read(&fs_info->scrub_super_lock);
- ret = scrub_supers(sdev);
- up_read(&fs_info->scrub_super_lock);
+ if (!is_dev_replace) {
+ down_read(&fs_info->scrub_super_lock);
+ ret = scrub_supers(sctx, dev);
+ up_read(&fs_info->scrub_super_lock);
+ }
if (!ret)
- ret = scrub_enumerate_chunks(sdev, start, end);
+ ret = scrub_enumerate_chunks(sctx, dev, start, end,
+ is_dev_replace);
- wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
wake_up(&fs_info->scrub_pause_wait);
- wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
+ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
if (progress)
- memcpy(progress, &sdev->stat, sizeof(*progress));
+ memcpy(progress, &sctx->stat, sizeof(*progress));
mutex_lock(&fs_info->scrub_lock);
dev->scrub_device = NULL;
mutex_unlock(&fs_info->scrub_lock);
- scrub_free_dev(sdev);
- scrub_workers_put(root);
+ scrub_free_ctx(sctx);
+ scrub_workers_put(fs_info);
return ret;
}
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
up_write(&root->fs_info->scrub_super_lock);
}
-int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
+int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
-
mutex_lock(&fs_info->scrub_lock);
if (!atomic_read(&fs_info->scrubs_running)) {
mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_scrub_cancel(struct btrfs_root *root)
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev)
{
- return __btrfs_scrub_cancel(root->fs_info);
-}
-
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct scrub_dev *sdev;
+ struct scrub_ctx *sctx;
mutex_lock(&fs_info->scrub_lock);
- sdev = dev->scrub_device;
- if (!sdev) {
+ sctx = dev->scrub_device;
+ if (!sctx) {
mutex_unlock(&fs_info->scrub_lock);
return -ENOTCONN;
}
- atomic_inc(&sdev->cancel_req);
+ atomic_inc(&sctx->cancel_req);
while (dev->scrub_device) {
mutex_unlock(&fs_info->scrub_lock);
wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
* does not go away in cancel_dev. FIXME: find a better solution
*/
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(root, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info, devid, NULL, NULL);
if (!dev) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return -ENODEV;
}
- ret = btrfs_scrub_cancel_dev(root, dev);
+ ret = btrfs_scrub_cancel_dev(fs_info, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return ret;
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress)
{
struct btrfs_device *dev;
- struct scrub_dev *sdev = NULL;
+ struct scrub_ctx *sctx = NULL;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(root, devid, NULL, NULL);
+ dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
if (dev)
- sdev = dev->scrub_device;
- if (sdev)
- memcpy(progress, &sdev->stat, sizeof(*progress));
+ sctx = dev->scrub_device;
+ if (sctx)
+ memcpy(progress, &sctx->stat, sizeof(*progress));
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
+ return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
+}
+
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u64 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
+{
+ u64 mapped_length;
+ struct btrfs_bio *bbio = NULL;
+ int ret;
+
+ mapped_length = extent_len;
+ ret = btrfs_map_block(fs_info, READ, extent_logical,
+ &mapped_length, &bbio, 0);
+ if (ret || !bbio || mapped_length < extent_len ||
+ !bbio->stripes[0].dev->bdev) {
+ kfree(bbio);
+ return;
+ }
+
+ *extent_physical = bbio->stripes[0].physical;
+ *extent_mirror_num = bbio->mirror_num;
+ *extent_dev = bbio->stripes[0].dev;
+ kfree(bbio);
+}
+
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+ struct scrub_wr_ctx *wr_ctx,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev,
+ int is_dev_replace)
+{
+ WARN_ON(wr_ctx->wr_curr_bio != NULL);
+
+ mutex_init(&wr_ctx->wr_lock);
+ wr_ctx->wr_curr_bio = NULL;
+ if (!is_dev_replace)
+ return 0;
+
+ WARN_ON(!dev->bdev);
+ wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
+ bio_get_nr_vecs(dev->bdev));
+ wr_ctx->tgtdev = dev;
+ atomic_set(&wr_ctx->flush_all_writes, 0);
+ return 0;
+}
+
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
+{
+ mutex_lock(&wr_ctx->wr_lock);
+ kfree(wr_ctx->wr_curr_bio);
+ wr_ctx->wr_curr_bio = NULL;
+ mutex_unlock(&wr_ctx->wr_lock);
+}
+
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ int mirror_num, u64 physical_for_dev_replace)
+{
+ struct scrub_copy_nocow_ctx *nocow_ctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+ nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
+ if (!nocow_ctx) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ scrub_pending_trans_workers_inc(sctx);
+
+ nocow_ctx->sctx = sctx;
+ nocow_ctx->logical = logical;
+ nocow_ctx->len = len;
+ nocow_ctx->mirror_num = mirror_num;
+ nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
+ nocow_ctx->work.func = copy_nocow_pages_worker;
+ btrfs_queue_worker(&fs_info->scrub_nocow_workers,
+ &nocow_ctx->work);
+
+ return 0;
+}
+
+static void copy_nocow_pages_worker(struct btrfs_work *work)
+{
+ struct scrub_copy_nocow_ctx *nocow_ctx =
+ container_of(work, struct scrub_copy_nocow_ctx, work);
+ struct scrub_ctx *sctx = nocow_ctx->sctx;
+ u64 logical = nocow_ctx->logical;
+ u64 len = nocow_ctx->len;
+ int mirror_num = nocow_ctx->mirror_num;
+ u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+ int ret;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ int not_written = 0;
+
+ fs_info = sctx->dev_root->fs_info;
+ root = fs_info->extent_root;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ not_written = 1;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ not_written = 1;
+ goto out;
+ }
+
+ ret = iterate_inodes_from_logical(logical, fs_info, path,
+ copy_nocow_pages_for_inode,
+ nocow_ctx);
+ if (ret != 0 && ret != -ENOENT) {
+ pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
+ (unsigned long long)logical,
+ (unsigned long long)physical_for_dev_replace,
+ (unsigned long long)len,
+ (unsigned long long)mirror_num, ret);
+ not_written = 1;
+ goto out;
+ }
+
+out:
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans, root);
+ if (not_written)
+ btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
+ num_uncorrectable_read_errors);
+
+ btrfs_free_path(path);
+ kfree(nocow_ctx);
+
+ scrub_pending_trans_workers_dec(sctx);
+}
+
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ unsigned long index;
+ struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+ int ret = 0;
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ struct btrfs_root *local_root;
+ u64 physical_for_dev_replace;
+ u64 len;
+ struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+
+ key.objectid = root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(local_root))
+ return PTR_ERR(local_root);
+
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.objectid = inum;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+ len = nocow_ctx->len;
+ while (len >= PAGE_CACHE_SIZE) {
+ struct page *page = NULL;
+ int ret_sub;
+
+ index = offset >> PAGE_CACHE_SHIFT;
+
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ pr_err("find_or_create_page() failed\n");
+ ret = -ENOMEM;
+ goto next_page;
+ }
+
+ if (PageUptodate(page)) {
+ if (PageDirty(page))
+ goto next_page;
+ } else {
+ ClearPageError(page);
+ ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+ io_tree,
+ page, btrfs_get_extent,
+ nocow_ctx->mirror_num);
+ if (ret_sub) {
+ ret = ret_sub;
+ goto next_page;
+ }
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto next_page;
+ }
+ }
+ ret_sub = write_page_nocow(nocow_ctx->sctx,
+ physical_for_dev_replace, page);
+ if (ret_sub) {
+ ret = ret_sub;
+ goto next_page;
+ }
+
+next_page:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ offset += PAGE_CACHE_SIZE;
+ physical_for_dev_replace += PAGE_CACHE_SIZE;
+ len -= PAGE_CACHE_SIZE;
+ }
+
+ if (inode)
+ iput(inode);
+ return ret;
+}
+
+static int write_page_nocow(struct scrub_ctx *sctx,
+ u64 physical_for_dev_replace, struct page *page)
+{
+ struct bio *bio;
+ struct btrfs_device *dev;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(compl);
+
+ dev = sctx->wr_ctx.tgtdev;
+ if (!dev)
+ return -EIO;
+ if (!dev->bdev) {
+ printk_ratelimited(KERN_WARNING
+ "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+ return -EIO;
+ }
+ bio = bio_alloc(GFP_NOFS, 1);
+ if (!bio) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+ bio->bi_private = &compl;
+ bio->bi_end_io = scrub_complete_bio_end_io;
+ bio->bi_size = 0;
+ bio->bi_sector = physical_for_dev_replace >> 9;
+ bio->bi_bdev = dev->bdev;
+ ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+ bio_put(bio);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ return -EIO;
+ }
+ btrfsic_submit_bio(WRITE_SYNC, bio);
+ wait_for_completion(&compl);
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ goto leave_with_eio;
+
+ bio_put(bio);
+ return 0;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..54454542ad40 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
if (!path)
return -ENOMEM;
- spin_lock(&send_root->root_times_lock);
+ spin_lock(&send_root->root_item_lock);
start_ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_times_lock);
+ spin_unlock(&send_root->root_item_lock);
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4422,9 @@ join_trans:
* Make sure the tree has not changed after re-joining. We detect this
* by comparing start_ctransid and ctransid. They should always match.
*/
- spin_lock(&send_root->root_times_lock);
+ spin_lock(&send_root->root_item_lock);
ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_times_lock);
+ spin_unlock(&send_root->root_item_lock);
if (ctransid != start_ctransid) {
WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..99545df1b86c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
+#include "dev-replace.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
sb->s_flags |= MS_RDONLY;
printk(KERN_INFO "btrfs is forced readonly\n");
- __btrfs_scrub_cancel(fs_info);
+ /*
+ * Note that a running device replace operation is not
+ * canceled here although there is no way to update
+ * the progress. It would add the risk of a deadlock,
+ * therefore the canceling is ommited. The only penalty
+ * is that some I/O remains active until the procedure
+ * completes. The next time when the filesystem is
+ * mounted writeable again, the device replace
+ * operation continues.
+ */
// WARN_ON(1);
}
}
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
+ new_pool_size);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
if (*flags & MS_RDONLY) {
+ /*
+ * this also happens on 'umount -rf' or on shutdown, when
+ * the filesystem is busy.
+ */
sb->s_flags |= MS_RDONLY;
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+
ret = btrfs_commit_super(root);
if (ret)
goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
+ if (fs_info->fs_devices->missing_devices >
+ fs_info->num_tolerated_disk_barrier_failures &&
+ !(*flags & MS_RDONLY)) {
+ printk(KERN_WARNING
+ "Btrfs: too many missing devices, writeable remount is not allowed\n");
+ ret = -EACCES;
+ goto restore;
+ }
+
if (btrfs_super_log_root(fs_info->super_copy) != 0) {
ret = -EINVAL;
goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ pr_warn("btrfs: failed to resume dev_replace\n");
+ goto restore;
+ }
sb->s_flags &= ~MS_RDONLY;
}
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
min_stripe_size = BTRFS_STRIPE_LEN;
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (!device->in_fs_metadata || !device->bdev)
+ if (!device->in_fs_metadata || !device->bdev ||
+ device->is_tgtdev_for_dev_replace)
continue;
avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_ordered_data;
- err = btrfs_interface_init();
+ err = btrfs_auto_defrag_init();
if (err)
goto free_delayed_inode;
+ err = btrfs_interface_init();
+ if (err)
+ goto free_auto_defrag;
+
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
+free_auto_defrag:
+ btrfs_auto_defrag_exit();
free_delayed_inode:
btrfs_delayed_inode_exit();
free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
+ btrfs_auto_defrag_exit();
btrfs_delayed_inode_exit();
ordered_data_exit();
extent_map_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04bbfb1052eb..87fac9a21ea5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
#include "tree-log.h"
#include "inode-map.h"
#include "volumes.h"
+#include "dev-replace.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -145,16 +146,12 @@ loop:
* the log must never go across transaction boundaries.
*/
smp_mb();
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+ if (!list_empty(&fs_info->tree_mod_seq_list))
+ WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
"creating a fresh transaction\n");
- WARN_ON(1);
- }
- if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
- printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+ if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
+ WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
"creating a fresh transaction\n");
- WARN_ON(1);
- }
atomic_set(&fs_info->tree_mod_seq, 0);
spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
return 0;
}
-static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
- u64 num_items, int type,
- int noflush)
+static struct btrfs_trans_handle *
+start_transaction(struct btrfs_root *root, u64 num_items, int type,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
h = current->journal_info;
h->use_count++;
+ WARN_ON(h->use_count > 2);
h->orig_rsv = h->block_rsv;
h->block_rsv = NULL;
goto got_it;
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
}
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
- if (noflush)
- ret = btrfs_block_rsv_add_noflush(root,
- &root->fs_info->trans_block_rsv,
- num_bytes);
- else
- ret = btrfs_block_rsv_add(root,
- &root->fs_info->trans_block_rsv,
- num_bytes);
+ ret = btrfs_block_rsv_add(root,
+ &root->fs_info->trans_block_rsv,
+ num_bytes, flush);
if (ret)
return ERR_PTR(ret);
}
@@ -422,13 +415,15 @@ got_it:
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items)
{
- return start_transaction(root, num_items, TRANS_START, 0);
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_ALL);
}
-struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_root *root, int num_items)
{
- return start_transaction(root, num_items, TRANS_START, 1);
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_LIMIT);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
struct btrfs_transaction *cur_trans = NULL, *t;
- int ret;
+ int ret = 0;
- ret = 0;
if (transid) {
if (transid <= root->fs_info->last_trans_committed)
goto out;
+ ret = -EINVAL;
/* find specified transaction */
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
if (t->transid == transid) {
cur_trans = t;
atomic_inc(&cur_trans->use_count);
+ ret = 0;
break;
}
- if (t->transid > transid)
+ if (t->transid > transid) {
+ ret = 0;
break;
+ }
}
spin_unlock(&root->fs_info->trans_lock);
- ret = -EINVAL;
+ /* The specified transaction doesn't exist */
if (!cur_trans)
- goto out; /* bad transid */
+ goto out;
} else {
/* find newest transaction that is committing | committed */
spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
}
wait_for_commit(root, cur_trans);
-
put_transaction(cur_trans);
- ret = 0;
out:
return ret;
}
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_run_dev_stats(trans, root->fs_info);
- BUG_ON(ret);
+ WARN_ON(ret);
+ ret = btrfs_run_dev_replace(trans, root->fs_info);
+ WARN_ON(ret);
ret = btrfs_run_qgroups(trans, root->fs_info);
BUG_ON(ret);
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
switch_commit_root(fs_info->extent_root);
up_write(&fs_info->extent_commit_sem);
+ btrfs_after_dev_replace_commit(fs_info);
+
return 0;
}
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_trans_handle *trans;
int ret;
- unsigned long nr;
if (xchg(&root->defrag_running, 1))
return 0;
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
ret = btrfs_defrag_leaves(trans, root, cacheonly);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(info->tree_root, nr);
+ btrfs_btree_balance_dirty(info->tree_root);
cond_resched();
if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
- ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
- to_reserve);
+ ret = btrfs_block_rsv_add(root, &pending->block_rsv,
+ to_reserve,
+ BTRFS_RESERVE_NO_FLUSH);
if (ret) {
pending->error = ret;
goto no_free_objectid;
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
parent_inode, &key,
BTRFS_FT_DIR, index);
/* We have check then name at the beginning, so it is impossible. */
- BUG_ON(ret == -EEXIST);
+ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto fail;
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
* We've got freeze protection passed with the transaction.
* Tell lockdep about it.
*/
- rwsem_acquire_read(
- &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
+ if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
+ rwsem_acquire_read(
+ &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
current->journal_info = ac->newtrans;
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
* Tell lockdep we've released the freeze rwsem, since the
* async commit thread will be the one to unlock it.
*/
- rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 1, _THIS_IP_);
+ if (trans->type < TRANS_JOIN_NOLOCK)
+ rwsem_release(
+ &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 1, _THIS_IP_);
schedule_delayed_work(&ac->work, 0);
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
+static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
+ int snap_pending = 0;
+ int ret;
+
+ if (!flush_on_commit) {
+ spin_lock(&root->fs_info->trans_lock);
+ if (!list_empty(&trans->transaction->pending_snapshots))
+ snap_pending = 1;
+ spin_unlock(&root->fs_info->trans_lock);
+ }
+
+ if (flush_on_commit || snap_pending) {
+ btrfs_start_delalloc_inodes(root, 1);
+ btrfs_wait_ordered_extents(root, 1);
+ }
+
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret)
+ return ret;
+
+ /*
+ * running the delayed items may have added new refs. account
+ * them now so that they hinder processing of more delayed refs
+ * as little as possible.
+ */
+ btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
+ /*
+ * rename don't use btrfs_join_transaction, so, once we
+ * set the transaction to blocked above, we aren't going
+ * to get any new ordered operations. We can safely run
+ * it here and no for sure that nothing new will be added
+ * to the list
+ */
+ btrfs_run_ordered_operations(root, 1);
+
+ return 0;
+}
+
/*
* btrfs_transaction state sequence:
* in_commit = 0, blocked = 0 (initial)
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
DEFINE_WAIT(wait);
- int ret = -EIO;
+ int ret;
int should_grow = 0;
unsigned long now = get_seconds();
- int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
- btrfs_run_ordered_operations(root, 0);
+ ret = btrfs_run_ordered_operations(root, 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto cleanup_transaction;
+ }
- if (cur_trans->aborted)
+ if (cur_trans->aborted) {
+ ret = cur_trans->aborted;
goto cleanup_transaction;
+ }
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
should_grow = 1;
do {
- int snap_pending = 0;
-
joined = cur_trans->num_joined;
- if (!list_empty(&trans->transaction->pending_snapshots))
- snap_pending = 1;
WARN_ON(cur_trans != trans->transaction);
- if (flush_on_commit || snap_pending) {
- btrfs_start_delalloc_inodes(root, 1);
- btrfs_wait_ordered_extents(root, 1);
- }
-
- ret = btrfs_run_delayed_items(trans, root);
+ ret = btrfs_flush_all_pending_stuffs(trans, root);
if (ret)
goto cleanup_transaction;
- /*
- * running the delayed items may have added new refs. account
- * them now so that they hinder processing of more delayed refs
- * as little as possible.
- */
- btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-
- /*
- * rename don't use btrfs_join_transaction, so, once we
- * set the transaction to blocked above, we aren't going
- * to get any new ordered operations. We can safely run
- * it here and no for sure that nothing new will be added
- * to the list
- */
- btrfs_run_ordered_operations(root, 1);
-
prepare_to_wait(&cur_trans->writer_wait, &wait,
TASK_UNINTERRUPTIBLE);
@@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
} while (atomic_read(&cur_trans->num_writers) > 1 ||
(should_grow && cur_trans->num_joined != joined));
+ ret = btrfs_flush_all_pending_stuffs(trans, root);
+ if (ret)
+ goto cleanup_transaction;
+
/*
* Ok now we need to make sure to block out any other joins while we
* commit the transaction. We could have started a join before setting
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b2..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items);
-struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_root *root, int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a..83186c7e45d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode, int log_inode_only)
{
- btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
- btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
- btrfs_set_inode_mode(leaf, item, inode->i_mode);
- btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
-
- btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_nsec);
-
- btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_nsec);
-
- btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_nsec);
-
- btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
-
- btrfs_set_inode_sequence(leaf, item, inode->i_version);
- btrfs_set_inode_transid(leaf, item, trans->transid);
- btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
- btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
- btrfs_set_inode_block_group(leaf, item, 0);
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
if (log_inode_only) {
/* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
- btrfs_set_inode_generation(leaf, item, 0);
- btrfs_set_inode_size(leaf, item, 0);
+ btrfs_set_token_inode_generation(leaf, item, 0, &token);
+ btrfs_set_token_inode_size(leaf, item, 0, &token);
} else {
- btrfs_set_inode_generation(leaf, item,
- BTRFS_I(inode)->generation);
- btrfs_set_inode_size(leaf, item, inode->i_size);
- }
+ btrfs_set_token_inode_generation(leaf, item,
+ BTRFS_I(inode)->generation,
+ &token);
+ btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
+ }
+
+ btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+ btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+ btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+ btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_nsec, &token);
+
+ btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+ &token);
+
+ btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+ btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+ btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+ btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+}
+static int log_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, struct btrfs_path *path,
+ struct inode *inode)
+{
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_key key;
+ int ret;
+
+ memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*inode_item));
+ if (ret && ret != -EEXIST)
+ return ret;
+ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+ btrfs_release_path(path);
+ return 0;
}
static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
-struct log_args {
- struct extent_buffer *src;
- u64 next_offset;
- int start_slot;
- int nr;
-};
+static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct extent_map *em,
+ struct btrfs_path *path)
+{
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct btrfs_key key, new_key;
+ struct btrfs_map_token token;
+ u64 extent_end;
+ u64 extent_offset = 0;
+ int extent_type;
+ int del_slot = 0;
+ int del_nr = 0;
+ int ret = 0;
+
+ while (1) {
+ btrfs_init_map_token(&token);
+ leaf = path->nodes[0];
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ if (del_nr) {
+ ret = btrfs_del_items(trans, root, path,
+ del_slot, del_nr);
+ if (ret)
+ return ret;
+ del_nr = 0;
+ }
+
+ ret = btrfs_next_leaf_write(trans, root, path, 1);
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 0;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY ||
+ key.offset >= em->start + em->len)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_offset = btrfs_token_file_extent_offset(leaf,
+ fi, &token);
+ extent_end = key.offset +
+ btrfs_token_file_extent_num_bytes(leaf, fi,
+ &token);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = key.offset +
+ btrfs_file_extent_inline_len(leaf, fi);
+ } else {
+ BUG();
+ }
+
+ if (extent_end <= em->len + em->start) {
+ if (!del_nr) {
+ del_slot = path->slots[0];
+ }
+ del_nr++;
+ continue;
+ }
+
+ /*
+ * Ok so we'll ignore previous items if we log a new extent,
+ * which can lead to overlapping extents, so if we have an
+ * existing extent we want to adjust we _have_ to check the next
+ * guy to make sure we even need this extent anymore, this keeps
+ * us from panicing in set_item_key_safe.
+ */
+ if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
+ struct btrfs_key tmp_key;
+
+ btrfs_item_key_to_cpu(leaf, &tmp_key,
+ path->slots[0] + 1);
+ if (tmp_key.objectid == btrfs_ino(inode) &&
+ tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
+ tmp_key.offset <= em->start + em->len) {
+ if (!del_nr)
+ del_slot = path->slots[0];
+ del_nr++;
+ continue;
+ }
+ }
+
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = em->start + em->len;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+ extent_offset += em->start + em->len - key.offset;
+ btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
+ &token);
+ btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
+ (em->start + em->len),
+ &token);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+
+ if (del_nr)
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+
+ return ret;
+}
static int log_one_extent(struct btrfs_trans_handle *trans,
struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path,
- struct btrfs_path *dst_path, struct log_args *args)
+ struct extent_map *em, struct btrfs_path *path)
{
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct list_head ordered_sums;
+ struct btrfs_map_token token;
struct btrfs_key key;
- u64 start = em->mod_start;
- u64 search_start = start;
- u64 len = em->mod_len;
- u64 num_bytes;
- int nritems;
+ u64 csum_offset = em->mod_start - em->start;
+ u64 csum_len = em->mod_len;
+ u64 extent_offset = em->start - em->orig_start;
+ u64 block_len;
int ret;
+ bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- if (BTRFS_I(inode)->logged_trans == trans->transid) {
- ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
- start + len, NULL, 0);
- if (ret)
- return ret;
+ INIT_LIST_HEAD(&ordered_sums);
+ btrfs_init_map_token(&token);
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+ path->really_keep_locks = 1;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
+ if (ret && ret != -EEXIST) {
+ path->really_keep_locks = 0;
+ return ret;
}
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+ &token);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ skip_csum = true;
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_PREALLOC,
+ &token);
+ } else {
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG,
+ &token);
+ if (em->block_start == 0)
+ skip_csum = true;
+ }
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start,
+ &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start -
+ extent_offset, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+ &token);
+ }
+
+ btrfs_set_token_file_extent_offset(leaf, fi,
+ em->start - em->orig_start,
+ &token);
+ btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+ btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
+ btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+ &token);
+ btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+ btrfs_mark_buffer_dirty(leaf);
- while (len) {
- if (args->nr)
- goto next_slot;
-again:
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = search_start;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- if (ret) {
- /*
- * A rare case were we can have an em for a section of a
- * larger extent so we need to make sure that this em
- * falls within the extent we've found. If not we just
- * bail and go back to ye-olde way of doing things but
- * it happens often enough in testing that we need to do
- * this dance to make sure.
- */
- do {
- if (path->slots[0] == 0) {
- btrfs_release_path(path);
- if (search_start == 0)
- return -ENOENT;
- search_start--;
- goto again;
- }
-
- path->slots[0]--;
- btrfs_item_key_to_cpu(path->nodes[0], &key,
- path->slots[0]);
- if (key.objectid != btrfs_ino(inode) ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
- btrfs_release_path(path);
- return -ENOENT;
- }
- } while (key.offset > start);
+ /*
+ * Have to check the extent to the right of us to make sure it doesn't
+ * fall in our current range. We're ok if the previous extent is in our
+ * range since the recovery stuff will run us in key order and thus just
+ * drop the part we overwrote.
+ */
+ ret = drop_adjacent_extents(trans, log, inode, em, path);
+ btrfs_release_path(path);
+ path->really_keep_locks = 0;
+ if (ret) {
+ return ret;
+ }
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
- fi);
- if (key.offset + num_bytes <= start) {
- btrfs_release_path(path);
- return -ENOENT;
- }
- }
- args->src = path->nodes[0];
-next_slot:
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- fi = btrfs_item_ptr(args->src, path->slots[0],
- struct btrfs_file_extent_item);
- if (args->nr &&
- args->start_slot + args->nr == path->slots[0]) {
- args->nr++;
- } else if (args->nr) {
- ret = copy_items(trans, inode, dst_path, args->src,
- args->start_slot, args->nr,
- LOG_INODE_ALL);
- if (ret)
- return ret;
- args->nr = 1;
- args->start_slot = path->slots[0];
- } else if (!args->nr) {
- args->nr = 1;
- args->start_slot = path->slots[0];
- }
- nritems = btrfs_header_nritems(path->nodes[0]);
- path->slots[0]++;
- num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
- if (len < num_bytes) {
- /* I _think_ this is ok, envision we write to a
- * preallocated space that is adjacent to a previously
- * written preallocated space that gets merged when we
- * mark this preallocated space written. If we do not
- * have the adjacent extent in cache then when we copy
- * this extent it could end up being larger than our EM
- * thinks it is, which is a-ok, so just set len to 0.
- */
- len = 0;
- } else {
- len -= num_bytes;
- }
- start = key.offset + num_bytes;
- args->next_offset = start;
- search_start = start;
+ if (skip_csum)
+ return 0;
- if (path->slots[0] < nritems) {
- if (len)
- goto next_slot;
- break;
- }
+ /* block start is already adjusted for the file extent offset. */
+ ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
+ em->block_start + csum_offset,
+ em->block_start + csum_offset +
+ csum_len - 1, &ordered_sums, 0);
+ if (ret)
+ return ret;
- if (args->nr) {
- ret = copy_items(trans, inode, dst_path, args->src,
- args->start_slot, args->nr,
- LOG_INODE_ALL);
- if (ret)
- return ret;
- args->nr = 0;
- btrfs_release_path(path);
- }
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
}
- return 0;
+ return ret;
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
- struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *path)
{
- struct log_args args;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
@@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&extents);
- memset(&args, 0, sizeof(args));
-
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
@@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
write_unlock(&tree->lock);
- /*
- * If the previous EM and the last extent we left off on aren't
- * sequential then we need to copy the items we have and redo
- * our search
- */
- if (args.nr && em->mod_start != args.next_offset) {
- ret = copy_items(trans, inode, dst_path, args.src,
- args.start_slot, args.nr,
- LOG_INODE_ALL);
- if (ret) {
- free_extent_map(em);
- write_lock(&tree->lock);
- continue;
- }
- btrfs_release_path(path);
- args.nr = 0;
- }
-
- ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
+ ret = log_one_extent(trans, inode, root, em, path);
free_extent_map(em);
write_lock(&tree->lock);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- if (!ret && args.nr)
- ret = copy_items(trans, inode, dst_path, args.src,
- args.start_slot, args.nr, LOG_INODE_ALL);
btrfs_release_path(path);
return ret;
}
@@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
/* today the code can only do partial logging of directories */
- if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) ||
+ (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags) &&
+ inode_only == LOG_INODE_EXISTS))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
@@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags)) {
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
ret = btrfs_truncate_inode_items(trans, log,
inode, 0, 0);
- } else {
- fast_search = true;
+ } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (inode_only == LOG_INODE_ALL)
+ fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino,
- BTRFS_XATTR_ITEM_KEY);
+ max_key.type);
+ } else {
+ if (inode_only == LOG_INODE_ALL)
+ fast_search = true;
+ ret = log_inode_item(trans, log, dst_path, inode);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ goto log_extents;
}
+
}
if (ret) {
err = ret;
@@ -3518,11 +3620,10 @@ next_slot:
ins_nr = 0;
}
+log_extents:
if (fast_search) {
- btrfs_release_path(path);
btrfs_release_path(dst_path);
- ret = btrfs_log_changed_extents(trans, root, inode, path,
- dst_path);
+ ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
if (ret) {
err = ret;
goto out_unlock;
@@ -3531,8 +3632,10 @@ next_slot:
struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em, *n;
+ write_lock(&tree->lock);
list_for_each_entry_safe(em, n, &tree->modified_extents, list)
list_del_init(&em->list);
+ write_unlock(&tree->lock);
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb72a5ea..5cce6aa74012 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
-#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
@@ -36,6 +35,8 @@
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
+#include "math.h"
+#include "dev-replace.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
kfree(fs_devices);
}
+static void btrfs_kobject_uevent(struct block_device *bdev,
+ enum kobject_action action)
+{
+ int ret;
+
+ ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+ if (ret)
+ pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
+ action,
+ kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+ &disk_to_dev(bdev->bd_disk)->kobj);
+}
+
void btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
return NULL;
}
+static int
+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
+ int flush, struct block_device **bdev,
+ struct buffer_head **bh)
+{
+ int ret;
+
+ *bdev = blkdev_get_by_path(device_path, flags, holder);
+
+ if (IS_ERR(*bdev)) {
+ ret = PTR_ERR(*bdev);
+ printk(KERN_INFO "btrfs: open %s failed\n", device_path);
+ goto error;
+ }
+
+ if (flush)
+ filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ ret = set_blocksize(*bdev, 4096);
+ if (ret) {
+ blkdev_put(*bdev, flags);
+ goto error;
+ }
+ invalidate_bdev(*bdev);
+ *bh = btrfs_read_dev_super(*bdev);
+ if (!*bh) {
+ ret = -EINVAL;
+ blkdev_put(*bdev, flags);
+ goto error;
+ }
+
+ return 0;
+
+error:
+ *bdev = NULL;
+ *bh = NULL;
+ return ret;
+}
+
static void requeue_list(struct btrfs_pending_bios *pending_bios,
struct bio *head, struct bio *tail)
{
@@ -467,7 +519,8 @@ error:
return ERR_PTR(-ENOMEM);
}
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
@@ -480,8 +533,9 @@ again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->in_fs_metadata) {
- if (!latest_transid ||
- device->generation > latest_transid) {
+ if (!device->is_tgtdev_for_dev_replace &&
+ (!latest_transid ||
+ device->generation > latest_transid)) {
latest_devid = device->devid;
latest_transid = device->generation;
latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
continue;
}
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+ /*
+ * In the first step, keep the device which has
+ * the correct fsid and the devid that is used
+ * for the dev_replace procedure.
+ * In the second step, the dev_replace state is
+ * read from the device tree and it is known
+ * whether the procedure is really active or
+ * not, which means whether this device is
+ * used or whether it should be removed.
+ */
+ if (step == 0 || device->is_tgtdev_for_dev_replace) {
+ continue;
+ }
+ }
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
if (device->writeable) {
list_del_init(&device->dev_alloc_list);
device->writeable = 0;
- fs_devices->rw_devices--;
+ if (!device->is_tgtdev_for_dev_replace)
+ fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
if (device->bdev)
fs_devices->open_devices--;
- if (device->writeable) {
+ if (device->writeable && !device->is_tgtdev_for_dev_replace) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
if (!device->name)
continue;
- bdev = blkdev_get_by_path(device->name->str, flags, holder);
- if (IS_ERR(bdev)) {
- printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
- goto error;
- }
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
- invalidate_bdev(bdev);
- set_blocksize(bdev, 4096);
-
- bh = btrfs_read_dev_super(bdev);
- if (!bh)
- goto error_close;
+ ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ &bdev, &bh);
+ if (ret)
+ continue;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->rotating = 1;
fs_devices->open_devices++;
- if (device->writeable) {
+ if (device->writeable && !device->is_tgtdev_for_dev_replace) {
fs_devices->rw_devices++;
list_add(&device->dev_alloc_list,
&fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
error_brelse:
brelse(bh);
-error_close:
blkdev_put(bdev, flags);
-error:
continue;
}
if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
u64 total_devices;
flags |= FMODE_EXCL;
- bdev = blkdev_get_by_path(path, flags, holder);
-
- if (IS_ERR(bdev)) {
- ret = PTR_ERR(bdev);
- goto error;
- }
-
mutex_lock(&uuid_mutex);
- ret = set_blocksize(bdev, 4096);
+ ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
if (ret)
- goto error_close;
- bh = btrfs_read_dev_super(bdev);
- if (!bh) {
- ret = -EINVAL;
- goto error_close;
- }
+ goto error;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
- if (disk_super->label[0])
+ if (disk_super->label[0]) {
+ if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
printk(KERN_INFO "device label %s ", disk_super->label);
- else
+ } else {
printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
+ }
printk(KERN_CONT "devid %llu transid %llu %s\n",
(unsigned long long)devid, (unsigned long long)transid, path);
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
brelse(bh);
-error_close:
- mutex_unlock(&uuid_mutex);
blkdev_put(bdev, flags);
error:
+ mutex_unlock(&uuid_mutex);
return ret;
}
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
*length = 0;
- if (start >= device->total_bytes)
+ if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
return 0;
path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
max_hole_size = 0;
hole_size = 0;
- if (search_start >= search_end) {
+ if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
goto error;
}
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
WARN_ON(!device->in_fs_metadata);
+ WARN_ON(device->is_tgtdev_for_dev_replace);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
root->fs_info->avail_system_alloc_bits |
root->fs_info->avail_metadata_alloc_bits;
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
- root->fs_info->fs_devices->num_devices <= 4) {
+ num_devices = root->fs_info->fs_devices->num_devices;
+ btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+ WARN_ON(num_devices < 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
printk(KERN_ERR "btrfs: unable to go below four devices "
"on raid10\n");
ret = -EINVAL;
goto out;
}
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
- root->fs_info->fs_devices->num_devices <= 2) {
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
printk(KERN_ERR "btrfs: unable to go below two "
"devices on raid1\n");
ret = -EINVAL;
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
* is held.
*/
list_for_each_entry(tmp, devices, dev_list) {
- if (tmp->in_fs_metadata && !tmp->bdev) {
+ if (tmp->in_fs_metadata &&
+ !tmp->is_tgtdev_for_dev_replace &&
+ !tmp->bdev) {
device = tmp;
break;
}
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
goto out;
}
} else {
- bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
- root->fs_info->bdev_holder);
- if (IS_ERR(bdev)) {
- ret = PTR_ERR(bdev);
+ ret = btrfs_get_bdev_and_sb(device_path,
+ FMODE_READ | FMODE_EXCL,
+ root->fs_info->bdev_holder, 0,
+ &bdev, &bh);
+ if (ret)
goto out;
- }
-
- set_blocksize(bdev, 4096);
- invalidate_bdev(bdev);
- bh = btrfs_read_dev_super(bdev);
- if (!bh) {
- ret = -EINVAL;
- goto error_close;
- }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- device = btrfs_find_device(root, devid, dev_uuid,
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid,
disk_super->fsid);
if (!device) {
ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
}
+ if (device->is_tgtdev_for_dev_replace) {
+ pr_err("btrfs: unable to remove the dev_replace target dev\n");
+ ret = -EINVAL;
+ goto error_brelse;
+ }
+
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
printk(KERN_ERR "btrfs: unable to remove the only writeable "
"device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (ret)
goto error_undo;
+ /*
+ * TODO: the superblock still includes this device in its num_devices
+ * counter although write_all_supers() is not locked out. This
+ * could give a filesystem state which requires a degraded mount.
+ */
ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
if (ret)
goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
spin_unlock(&root->fs_info->free_chunk_lock);
device->in_fs_metadata = 0;
- btrfs_scrub_cancel_dev(root, device);
+ btrfs_scrub_cancel_dev(root->fs_info, device);
/*
* the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
*/
- if (clear_super) {
+ if (clear_super && disk_super) {
/* make sure this device isn't detected as part of
* the FS anymore
*/
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
ret = 0;
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
error_brelse:
brelse(bh);
-error_close:
if (bdev)
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
@@ -1512,6 +1576,112 @@ error_undo:
goto error_brelse;
}
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev)
+{
+ WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+ list_del_rcu(&srcdev->dev_list);
+ list_del_rcu(&srcdev->dev_alloc_list);
+ fs_info->fs_devices->num_devices--;
+ if (srcdev->missing) {
+ fs_info->fs_devices->missing_devices--;
+ fs_info->fs_devices->rw_devices++;
+ }
+ if (srcdev->can_discard)
+ fs_info->fs_devices->num_can_discard--;
+ if (srcdev->bdev)
+ fs_info->fs_devices->open_devices--;
+
+ call_rcu(&srcdev->rcu, free_device);
+}
+
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev)
+{
+ struct btrfs_device *next_device;
+
+ WARN_ON(!tgtdev);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ if (tgtdev->bdev) {
+ btrfs_scratch_superblock(tgtdev);
+ fs_info->fs_devices->open_devices--;
+ }
+ fs_info->fs_devices->num_devices--;
+ if (tgtdev->can_discard)
+ fs_info->fs_devices->num_can_discard++;
+
+ next_device = list_entry(fs_info->fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ if (tgtdev->bdev == fs_info->sb->s_bdev)
+ fs_info->sb->s_bdev = next_device->bdev;
+ if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
+ fs_info->fs_devices->latest_bdev = next_device->bdev;
+ list_del_rcu(&tgtdev->dev_list);
+
+ call_rcu(&tgtdev->rcu, free_device);
+
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+}
+
+int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device)
+{
+ int ret = 0;
+ struct btrfs_super_block *disk_super;
+ u64 devid;
+ u8 *dev_uuid;
+ struct block_device *bdev;
+ struct buffer_head *bh;
+
+ *device = NULL;
+ ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
+ root->fs_info->bdev_holder, 0, &bdev, &bh);
+ if (ret)
+ return ret;
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_uuid = disk_super->dev_item.uuid;
+ *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+ disk_super->fsid);
+ brelse(bh);
+ if (!*device)
+ ret = -ENOENT;
+ blkdev_put(bdev, FMODE_READ);
+ return ret;
+}
+
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+ char *device_path,
+ struct btrfs_device **device)
+{
+ *device = NULL;
+ if (strcmp(device_path, "missing") == 0) {
+ struct list_head *devices;
+ struct btrfs_device *tmp;
+
+ devices = &root->fs_info->fs_devices->devices;
+ /*
+ * It is safe to read the devices since the volume_mutex
+ * is held by the caller.
+ */
+ list_for_each_entry(tmp, devices, dev_list) {
+ if (tmp->in_fs_metadata && !tmp->bdev) {
+ *device = tmp;
+ break;
+ }
+ }
+
+ if (!*device) {
+ pr_err("btrfs: no missing device found\n");
+ return -ENOENT;
+ }
+
+ return 0;
+ } else {
+ return btrfs_find_device_by_path(root, device_path, device);
+ }
+}
+
/*
* does all the dirty work required for changing file system's UUID.
*/
@@ -1630,7 +1800,8 @@ next_slot:
read_extent_buffer(leaf, fs_uuid,
(unsigned long)btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
- device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+ fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &root->fs_info->fs_devices->devices;
- /*
- * we have the volume lock, so we don't need the extra
- * device list mutex while reading the list here.
- */
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
+ mutex_unlock(
+ &root->fs_info->fs_devices->device_list_mutex);
goto error;
}
}
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
device = kzalloc(sizeof(*device), GFP_NOFS);
if (!device) {
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
+ device->is_tgtdev_for_dev_replace = 0;
device->mode = FMODE_EXCL;
set_blocksize(device->bdev, 4096);
@@ -1844,6 +2017,98 @@ error:
return ret;
}
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device_out)
+{
+ struct request_queue *q;
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head *devices;
+ struct rcu_string *name;
+ int ret = 0;
+
+ *device_out = NULL;
+ if (fs_info->fs_devices->seeding)
+ return -EINVAL;
+
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+ fs_info->bdev_holder);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
+ if (device->bdev == bdev) {
+ ret = -EEXIST;
+ goto error;
+ }
+ }
+
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ name = rcu_string_strdup(device_path, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+
+ q = bdev_get_queue(bdev);
+ if (blk_queue_discard(q))
+ device->can_discard = 1;
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ device->writeable = 1;
+ device->work.func = pending_bios_fn;
+ generate_random_uuid(device->uuid);
+ device->devid = BTRFS_DEV_REPLACE_DEVID;
+ spin_lock_init(&device->io_lock);
+ device->generation = 0;
+ device->io_width = root->sectorsize;
+ device->io_align = root->sectorsize;
+ device->sector_size = root->sectorsize;
+ device->total_bytes = i_size_read(bdev->bd_inode);
+ device->disk_total_bytes = device->total_bytes;
+ device->dev_root = fs_info->dev_root;
+ device->bdev = bdev;
+ device->in_fs_metadata = 1;
+ device->is_tgtdev_for_dev_replace = 1;
+ device->mode = FMODE_EXCL;
+ set_blocksize(device->bdev, 4096);
+ device->fs_devices = fs_info->fs_devices;
+ list_add(&device->dev_list, &fs_info->fs_devices->devices);
+ fs_info->fs_devices->num_devices++;
+ fs_info->fs_devices->open_devices++;
+ if (device->can_discard)
+ fs_info->fs_devices->num_can_discard++;
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ *device_out = device;
+ return ret;
+
+error:
+ blkdev_put(bdev, FMODE_EXCL);
+ return ret;
+}
+
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev)
+{
+ WARN_ON(fs_info->fs_devices->rw_devices == 0);
+ tgtdev->io_width = fs_info->dev_root->sectorsize;
+ tgtdev->io_align = fs_info->dev_root->sectorsize;
+ tgtdev->sector_size = fs_info->dev_root->sectorsize;
+ tgtdev->dev_root = fs_info->dev_root;
+ tgtdev->in_fs_metadata = 1;
+}
+
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
if (!device->writeable)
return -EACCES;
- if (new_size <= device->total_bytes)
+ if (new_size <= device->total_bytes ||
+ device->is_tgtdev_for_dev_replace)
return -EINVAL;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
return 1;
}
-static u64 div_factor_fine(u64 num, int factor)
-{
- if (factor <= 0)
- return 0;
- if (factor >= 100)
- return num;
-
- num *= factor;
- do_div(num, 100);
- return num;
-}
-
static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
@@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,
return 1;
}
-static u64 div_factor(u64 num, int factor)
-{
- if (factor == 10)
- return num;
- num *= factor;
- do_div(num, 10);
- return num;
-}
-
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
size_to_free = div_factor(old_size, 1);
size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
if (!device->writeable ||
- device->total_bytes - device->bytes_used > size_to_free)
+ device->total_bytes - device->bytes_used > size_to_free ||
+ device->is_tgtdev_for_dev_replace)
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
u64 allowed;
int mixed = 0;
int ret;
+ u64 num_devices;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
}
+ num_devices = fs_info->fs_devices->num_devices;
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ BUG_ON(num_devices < 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
- if (fs_info->fs_devices->num_devices == 1)
+ if (num_devices == 1)
allowed |= BTRFS_BLOCK_GROUP_DUP;
- else if (fs_info->fs_devices->num_devices < 4)
+ else if (num_devices < 4)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
else
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2902,6 +3156,7 @@ static int balance_kthread(void *data)
ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
@@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
return 0;
}
+ WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
if (IS_ERR(tsk))
return PTR_ERR(tsk);
@@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
- if (new_size >= device->total_bytes)
+ if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
path = btrfs_alloc_path();
@@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
+struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ { 2, 1, 0, 4, 2, 2 /* raid10 */ },
+ { 1, 1, 2, 2, 2, 2 /* raid1 */ },
+ { 1, 2, 1, 1, 1, 2 /* dup */ },
+ { 1, 1, 0, 2, 1, 1 /* raid0 */ },
+ { 1, 1, 0, 1, 1, 1 /* single */ },
+};
+
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
struct map_lookup **map_ret,
@@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
int ndevs;
int i;
int j;
+ int index;
BUG_ON(!alloc_profile_is_valid(type, 0));
if (list_empty(&fs_devices->alloc_list))
return -ENOSPC;
- sub_stripes = 1;
- dev_stripes = 1;
- devs_increment = 1;
- ncopies = 1;
- devs_max = 0; /* 0 == as many as possible */
- devs_min = 1;
+ index = __get_raid_index(type);
- /*
- * define the properties of each RAID type.
- * FIXME: move this to a global table and use it in all RAID
- * calculation code
- */
- if (type & (BTRFS_BLOCK_GROUP_DUP)) {
- dev_stripes = 2;
- ncopies = 2;
- devs_max = 1;
- } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- devs_min = 2;
- } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
- devs_increment = 2;
- ncopies = 2;
- devs_max = 2;
- devs_min = 2;
- } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- sub_stripes = 2;
- devs_increment = 2;
- ncopies = 2;
- devs_min = 4;
- } else {
- devs_max = 1;
- }
+ sub_stripes = btrfs_raid_array[index].sub_stripes;
+ dev_stripes = btrfs_raid_array[index].dev_stripes;
+ devs_max = btrfs_raid_array[index].devs_max;
+ devs_min = btrfs_raid_array[index].devs_min;
+ devs_increment = btrfs_raid_array[index].devs_increment;
+ ncopies = btrfs_raid_array[index].ncopies;
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
cur = cur->next;
if (!device->writeable) {
- printk(KERN_ERR
+ WARN(1, KERN_ERR
"btrfs: read-only device in alloc_list\n");
- WARN_ON(1);
continue;
}
- if (!device->in_fs_metadata)
+ if (!device->in_fs_metadata ||
+ device->is_tgtdev_for_dev_replace)
continue;
if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
++ndevs;
+ WARN_ON(ndevs > fs_devices->rw_devices);
}
/*
@@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
}
}
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
else
ret = 1;
free_extent_map(em);
+
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+ ret++;
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
return ret;
}
-static int find_live_mirror(struct map_lookup *map, int first, int num,
- int optimal)
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+ struct map_lookup *map, int first, int num,
+ int optimal, int dev_replace_is_ongoing)
{
int i;
- if (map->stripes[optimal].dev->bdev)
- return optimal;
- for (i = first; i < first + num; i++) {
- if (map->stripes[i].dev->bdev)
- return i;
+ int tolerance;
+ struct btrfs_device *srcdev;
+
+ if (dev_replace_is_ongoing &&
+ fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+ srcdev = fs_info->dev_replace.srcdev;
+ else
+ srcdev = NULL;
+
+ /*
+ * try to avoid the drive that is the source drive for a
+ * dev-replace procedure, only choose it if no other non-missing
+ * mirror is available
+ */
+ for (tolerance = 0; tolerance < 2; tolerance++) {
+ if (map->stripes[optimal].dev->bdev &&
+ (tolerance || map->stripes[optimal].dev != srcdev))
+ return optimal;
+ for (i = first; i < first + num; i++) {
+ if (map->stripes[i].dev->bdev &&
+ (tolerance || map->stripes[i].dev != srcdev))
+ return i;
+ }
}
+
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
return optimal;
}
-static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
@@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
int num_stripes;
int max_errors = 0;
struct btrfs_bio *bbio = NULL;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int dev_replace_is_ongoing = 0;
+ int num_alloc_stripes;
+ int patch_the_first_stripe_for_dev_replace = 0;
+ u64 physical_to_patch_in_first_stripe = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
- if (mirror_num > map->num_stripes)
- mirror_num = 0;
-
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
if (!bbio_ret)
goto out;
+ btrfs_dev_replace_lock(dev_replace);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (!dev_replace_is_ongoing)
+ btrfs_dev_replace_unlock(dev_replace);
+
+ if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+ !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+ dev_replace->tgtdev != NULL) {
+ /*
+ * in dev-replace case, for repair case (that's the only
+ * case where the mirror is selected explicitly when
+ * calling btrfs_map_block), blocks left of the left cursor
+ * can also be read from the target drive.
+ * For REQ_GET_READ_MIRRORS, the target drive is added as
+ * the last one to the array of stripes. For READ, it also
+ * needs to be supported using the same mirror number.
+ * If the requested block is not left of the left cursor,
+ * EIO is returned. This can happen because btrfs_num_copies()
+ * returns one more in the dev-replace case.
+ */
+ u64 tmp_length = *length;
+ struct btrfs_bio *tmp_bbio = NULL;
+ int tmp_num_stripes;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+ logical, &tmp_length, &tmp_bbio, 0);
+ if (ret) {
+ WARN_ON(tmp_bbio != NULL);
+ goto out;
+ }
+
+ tmp_num_stripes = tmp_bbio->num_stripes;
+ if (mirror_num > tmp_num_stripes) {
+ /*
+ * REQ_GET_READ_MIRRORS does not contain this
+ * mirror, that means that the requested area
+ * is not left of the left cursor
+ */
+ ret = -EIO;
+ kfree(tmp_bbio);
+ goto out;
+ }
+
+ /*
+ * process the rest of the function using the mirror_num
+ * of the source drive. Therefore look it up first.
+ * At the end, patch the device pointer to the one of the
+ * target drive.
+ */
+ for (i = 0; i < tmp_num_stripes; i++) {
+ if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it
+ * simple, only add the mirror with the
+ * lowest physical address
+ */
+ if (found &&
+ physical_of_found <=
+ tmp_bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found =
+ tmp_bbio->stripes[i].physical;
+ }
+ }
+
+ if (found) {
+ mirror_num = index_srcdev + 1;
+ patch_the_first_stripe_for_dev_replace = 1;
+ physical_to_patch_in_first_stripe = physical_of_found;
+ } else {
+ WARN_ON(1);
+ ret = -EIO;
+ kfree(tmp_bbio);
+ goto out;
+ }
+
+ kfree(tmp_bbio);
+ } else if (mirror_num > map->num_stripes) {
+ mirror_num = 0;
+ }
+
num_stripes = 1;
stripe_index = 0;
stripe_nr_orig = stripe_nr;
@@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
stripe_nr_end - stripe_nr_orig);
stripe_index = do_div(stripe_nr, map->num_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (rw & (REQ_WRITE | REQ_DISCARD))
+ if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
else {
- stripe_index = find_live_mirror(map, 0,
+ stripe_index = find_live_mirror(fs_info, map, 0,
map->num_stripes,
- current->pid % map->num_stripes);
+ current->pid % map->num_stripes,
+ dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (rw & (REQ_WRITE | REQ_DISCARD)) {
+ if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
stripe_index = do_div(stripe_nr, factor);
stripe_index *= map->sub_stripes;
- if (rw & REQ_WRITE)
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
num_stripes = map->sub_stripes;
else if (rw & REQ_DISCARD)
num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
stripe_index += mirror_num - 1;
else {
int old_stripe_index = stripe_index;
- stripe_index = find_live_mirror(map, stripe_index,
+ stripe_index = find_live_mirror(fs_info, map,
+ stripe_index,
map->sub_stripes, stripe_index +
- current->pid % map->sub_stripes);
+ current->pid % map->sub_stripes,
+ dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
} else {
@@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
}
BUG_ON(stripe_index >= map->num_stripes);
- bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+ num_alloc_stripes = num_stripes;
+ if (dev_replace_is_ongoing) {
+ if (rw & (REQ_WRITE | REQ_DISCARD))
+ num_alloc_stripes <<= 1;
+ if (rw & REQ_GET_READ_MIRRORS)
+ num_alloc_stripes++;
+ }
+ bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
if (!bbio) {
ret = -ENOMEM;
goto out;
@@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
}
}
- if (rw & REQ_WRITE) {
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
}
}
+ if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+ dev_replace->tgtdev != NULL) {
+ int index_where_to_add;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+
+ /*
+ * duplicate the write operations while the dev replace
+ * procedure is running. Since the copying of the old disk
+ * to the new disk takes place at run time while the
+ * filesystem is mounted writable, the regular write
+ * operations to the old disk have to be duplicated to go
+ * to the new disk as well.
+ * Note that device->missing is handled by the caller, and
+ * that the write to the old disk is already set up in the
+ * stripes array.
+ */
+ index_where_to_add = num_stripes;
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /* write to new disk, too */
+ struct btrfs_bio_stripe *new =
+ bbio->stripes + index_where_to_add;
+ struct btrfs_bio_stripe *old =
+ bbio->stripes + i;
+
+ new->physical = old->physical;
+ new->length = old->length;
+ new->dev = dev_replace->tgtdev;
+ index_where_to_add++;
+ max_errors++;
+ }
+ }
+ num_stripes = index_where_to_add;
+ } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+ dev_replace->tgtdev != NULL) {
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ /*
+ * During the dev-replace procedure, the target drive can
+ * also be used to read data in case it is needed to repair
+ * a corrupt block elsewhere. This is possible if the
+ * requested area is left of the left cursor. In this area,
+ * the target drive is a full copy of the source drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it
+ * simple, only add the mirror with the
+ * lowest physical address
+ */
+ if (found &&
+ physical_of_found <=
+ bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+ }
+ if (found) {
+ u64 length = map->stripe_len;
+
+ if (physical_of_found + length <=
+ dev_replace->cursor_left) {
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
+
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+
+ num_stripes++;
+ }
+ }
+ }
+
*bbio_ret = bbio;
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
+
+ /*
+ * this is the case that REQ_READ && dev_replace_is_ongoing &&
+ * mirror_num == num_stripes + 1 && dev_replace target drive is
+ * available as a mirror
+ */
+ if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+ WARN_ON(num_stripes > 1);
+ bbio->stripes[0].dev = dev_replace->tgtdev;
+ bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bbio->mirror_num = map->num_stripes + 1;
+ }
out:
+ if (dev_replace_is_ongoing)
+ btrfs_dev_replace_unlock(dev_replace);
free_extent_map(em);
return ret;
}
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num)
{
- return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
+ return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
mirror_num);
}
@@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
&device->work);
}
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+ sector_t sector)
+{
+ struct bio_vec *prev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ unsigned short max_sectors = queue_max_sectors(q);
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bdev,
+ .bi_sector = sector,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (bio->bi_vcnt == 0) {
+ WARN_ON(1);
+ return 1;
+ }
+
+ prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ if ((bio->bi_size >> 9) > max_sectors)
+ return 0;
+
+ if (!q->merge_bvec_fn)
+ return 1;
+
+ bvm.bi_size = bio->bi_size - prev->bv_len;
+ if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+ return 0;
+ return 1;
+}
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+ struct bio *bio, u64 physical, int dev_nr,
+ int rw, int async)
+{
+ struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+ bio->bi_private = bbio;
+ bio->bi_private = merge_stripe_index_into_bio_private(
+ bio->bi_private, (unsigned int)dev_nr);
+ bio->bi_end_io = btrfs_end_bio;
+ bio->bi_sector = physical >> 9;
+#ifdef DEBUG
+ {
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+ "(%s id %llu), size=%u\n", rw,
+ (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+ name->str, dev->devid, bio->bi_size);
+ rcu_read_unlock();
+ }
+#endif
+ bio->bi_bdev = dev->bdev;
+ if (async)
+ schedule_bio(root, dev, rw, bio);
+ else
+ btrfsic_submit_bio(rw, bio);
+}
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+ struct bio *first_bio, struct btrfs_device *dev,
+ int dev_nr, int rw, int async)
+{
+ struct bio_vec *bvec = first_bio->bi_io_vec;
+ struct bio *bio;
+ int nr_vecs = bio_get_nr_vecs(dev->bdev);
+ u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+ bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+ if (!bio)
+ return -ENOMEM;
+
+ while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+ if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+ bvec->bv_offset) < bvec->bv_len) {
+ u64 len = bio->bi_size;
+
+ atomic_inc(&bbio->stripes_pending);
+ submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+ rw, async);
+ physical += len;
+ goto again;
+ }
+ bvec++;
+ }
+
+ submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
+ return 0;
+}
+
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+ atomic_inc(&bbio->error);
+ if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ bio->bi_private = bbio->private;
+ bio->bi_end_io = bbio->end_io;
+ bio->bi_bdev = (struct block_device *)
+ (unsigned long)bbio->mirror_num;
+ bio->bi_sector = logical >> 9;
+ kfree(bbio);
+ bio_endio(bio, -EIO);
+ }
+}
+
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
int mirror_num, int async_submit)
{
- struct btrfs_mapping_tree *map_tree;
struct btrfs_device *dev;
struct bio *first_bio = bio;
u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
struct btrfs_bio *bbio = NULL;
length = bio->bi_size;
- map_tree = &root->fs_info->mapping_tree;
map_length = length;
- ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
+ ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
mirror_num);
- if (ret) /* -ENOMEM */
+ if (ret)
return ret;
total_devs = bbio->num_stripes;
@@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
while (dev_nr < total_devs) {
+ dev = bbio->stripes[dev_nr].dev;
+ if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+ bbio_error(bbio, first_bio, logical);
+ dev_nr++;
+ continue;
+ }
+
+ /*
+ * Check and see if we're ok with this bio based on it's size
+ * and offset with the given device.
+ */
+ if (!bio_size_ok(dev->bdev, first_bio,
+ bbio->stripes[dev_nr].physical >> 9)) {
+ ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+ dev_nr, rw, async_submit);
+ BUG_ON(ret);
+ dev_nr++;
+ continue;
+ }
+
if (dev_nr < total_devs - 1) {
bio = bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
}
- bio->bi_private = bbio;
- bio->bi_private = merge_stripe_index_into_bio_private(
- bio->bi_private, (unsigned int)dev_nr);
- bio->bi_end_io = btrfs_end_bio;
- bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
- dev = bbio->stripes[dev_nr].dev;
- if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
-#ifdef DEBUG
- struct rcu_string *name;
-
- rcu_read_lock();
- name = rcu_dereference(dev->name);
- pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
- "(%s id %llu), size=%u\n", rw,
- (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
- name->str, dev->devid, bio->bi_size);
- rcu_read_unlock();
-#endif
- bio->bi_bdev = dev->bdev;
- if (async_submit)
- schedule_bio(root, dev, rw, bio);
- else
- btrfsic_submit_bio(rw, bio);
- } else {
- bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
- bio->bi_sector = logical >> 9;
- bio_endio(bio, -EIO);
- }
+
+ submit_stripe_bio(root, bbio, bio,
+ bbio->stripes[dev_nr].physical, dev_nr, rw,
+ async_submit);
dev_nr++;
}
return 0;
}
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
- cur_devices = root->fs_info->fs_devices;
+ cur_devices = fs_info->fs_devices;
while (cur_devices) {
if (!fsid ||
!memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
+ em->orig_start = 0;
em->block_start = 0;
em->block_len = em->len;
@@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
- NULL);
+ map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+ uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
kfree(map);
free_extent_map(em);
@@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+ WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+ device->is_tgtdev_for_dev_replace = 0;
ptr = (unsigned long)btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,
return ret;
}
- device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
if (!device || !device->bdev) {
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
@@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,
fill_device_from_item(leaf, dev_item, device);
device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
- if (device->writeable) {
+ if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+ dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
return 0;
}
+
+int btrfs_scratch_superblock(struct btrfs_device *device)
+{
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+
+ bh = btrfs_read_dev_super(device->bdev);
+ if (!bh)
+ return -EINVAL;
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+
+ return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
int in_fs_metadata;
int missing;
int can_discard;
+ int is_tgtdev_for_dev_replace;
spinlock_t io_lock;
@@ -88,7 +89,7 @@ struct btrfs_device {
u8 uuid[BTRFS_UUID_SIZE];
/* per-device scrub information */
- struct scrub_dev *scrub_device;
+ struct scrub_ctx *scrub_device;
struct btrfs_work work;
struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
u64 total_avail;
};
+struct btrfs_raid_attr {
+ int sub_stripes; /* sub_stripes info for map */
+ int dev_stripes; /* stripes per dev */
+ int devs_max; /* max devs to use */
+ int devs_min; /* min devs needed */
+ int devs_increment; /* ndevs has to be a multiple of this */
+ int ncopies; /* how many copies to data has */
+};
+
struct map_lookup {
u64 type;
int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset, u64 start, u64 num_bytes);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices, int step);
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+ char *device_path,
+ struct btrfs_device **device);
+int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device);
int btrfs_add_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_root *root, char *device_path);
void btrfs_cleanup_fs_uuids(void);
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device_out);
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev);
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev);
+int btrfs_scratch_superblock(struct btrfs_device *device);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
*/
if (!value)
goto out;
+ } else {
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+ name, name_len, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ if (!di && !value)
+ goto out;
+ btrfs_release_path(path);
}
again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
if (verify_dir_item(root, leaf, di))
- continue;
+ goto next;
name_len = btrfs_dir_name_len(leaf, di);
total_size += name_len + 1;