Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/async-thread.c        |    4
-rw-r--r--  fs/btrfs/async-thread.h        |    2
-rw-r--r--  fs/btrfs/backref.c             |    4
-rw-r--r--  fs/btrfs/btrfs_inode.h         |   14
-rw-r--r--  fs/btrfs/check-integrity.c     |    9
-rw-r--r--  fs/btrfs/compression.c         |    4
-rw-r--r--  fs/btrfs/compression.h         |    4
-rw-r--r--  fs/btrfs/ctree.c               |   62
-rw-r--r--  fs/btrfs/ctree.h               |   46
-rw-r--r--  fs/btrfs/delayed-inode.c       |   11
-rw-r--r--  fs/btrfs/delayed-ref.c         |   22
-rw-r--r--  fs/btrfs/delayed-ref.h         |   10
-rw-r--r--  fs/btrfs/dev-replace.c         |    6
-rw-r--r--  fs/btrfs/disk-io.c             |  570
-rw-r--r--  fs/btrfs/disk-io.h             |    4
-rw-r--r--  fs/btrfs/export.c              |    6
-rw-r--r--  fs/btrfs/extent-tree.c         |  561
-rw-r--r--  fs/btrfs/extent_io.c           |   78
-rw-r--r--  fs/btrfs/extent_io.h           |    2
-rw-r--r--  fs/btrfs/file-item.c           |    6
-rw-r--r--  fs/btrfs/file.c                |   97
-rw-r--r--  fs/btrfs/free-space-cache.c    |  351
-rw-r--r--  fs/btrfs/free-space-cache.h    |    9
-rw-r--r--  fs/btrfs/inode-map.c           |    2
-rw-r--r--  fs/btrfs/inode.c               |  227
-rw-r--r--  fs/btrfs/ioctl.c               |   58
-rw-r--r--  fs/btrfs/lzo.c                 |    2
-rw-r--r--  fs/btrfs/math.h                |    6
-rw-r--r--  fs/btrfs/ordered-data.c        |   14
-rw-r--r--  fs/btrfs/props.c               |    2
-rw-r--r--  fs/btrfs/qgroup.c              |  348
-rw-r--r--  fs/btrfs/qgroup.h              |    3
-rw-r--r--  fs/btrfs/raid56.c              |   16
-rw-r--r--  fs/btrfs/relocation.c          |   11
-rw-r--r--  fs/btrfs/scrub.c               |   25
-rw-r--r--  fs/btrfs/send.c                |   83
-rw-r--r--  fs/btrfs/super.c               |   31
-rw-r--r--  fs/btrfs/sysfs.c               |    2
-rw-r--r--  fs/btrfs/sysfs.h               |   22
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c  |    4
-rw-r--r--  fs/btrfs/transaction.c         |   54
-rw-r--r--  fs/btrfs/transaction.h         |   12
-rw-r--r--  fs/btrfs/tree-log.c            |  396
-rw-r--r--  fs/btrfs/tree-log.h            |    2
-rw-r--r--  fs/btrfs/volumes.c             |  155
-rw-r--r--  fs/btrfs/volumes.h             |    3
-rw-r--r--  fs/btrfs/xattr.c               |   69
-rw-r--r--  fs/btrfs/zlib.c                |    2
48 files changed, 2373 insertions(+), 1058 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4dabeb893b7c..df9932b00d08 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -87,7 +87,7 @@ BTRFS_WORK_HELPER(scrubwrc_helper);
BTRFS_WORK_HELPER(scrubnc_helper);
static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -132,7 +132,7 @@ static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
- int flags,
+ unsigned int flags,
int max_active,
int thresh)
{
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e386c29ef1f6..ec2ee477f8ba 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -66,7 +66,7 @@ BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
- int flags,
+ unsigned int flags,
int max_active,
int thresh);
void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f55721ff9385..9de772ee0031 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1206,7 +1206,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
struct ulist *roots = NULL;
struct ulist_iterator uiter;
struct ulist_node *node;
- struct seq_list elem = {};
+ struct seq_list elem = SEQ_LIST_INIT(elem);
int ret = 0;
tmp = ulist_alloc(GFP_NOFS);
@@ -1610,7 +1610,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list tree_mod_seq_elem = {};
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
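For reference, SEQ_LIST_INIT (added to ctree.h later in this patch) self-links the embedded list head at declaration time, which the old empty {} initializer did not. A rough expansion, assuming the standard LIST_HEAD_INIT from include/linux/list.h:

    struct seq_list elem = {
            .list = { &elem.list, &elem.list }, /* prev/next point at itself */
            .seq = 0,
    };

With that, list_empty(&elem.list) is well defined even before the element has been added to the fs_info list.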
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index de5e4f2adfea..0ef5cc13fae2 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,7 +66,11 @@ struct btrfs_inode {
*/
struct btrfs_key location;
- /* Lock for counters */
+ /*
+ * Lock for counters and all fields used to determine if the inode is in
+ * the log or not (last_trans, last_sub_trans, last_log_commit,
+ * logged_trans).
+ */
spinlock_t lock;
/* the extent_tree has caches of all the extent mappings to disk */
@@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
{
+ int ret = 0;
+
+ spin_lock(&BTRFS_I(inode)->lock);
if (BTRFS_I(inode)->logged_trans == generation &&
BTRFS_I(inode)->last_sub_trans <=
BTRFS_I(inode)->last_log_commit &&
@@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
*/
smp_mb();
if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
- return 1;
+ ret = 1;
}
- return 0;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return ret;
}
#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
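The locked reader above only stays correct if every writer of logged_trans, last_sub_trans and last_log_commit takes the same spinlock. A minimal sketch of the writer pairing this change assumes (illustrative, not a hunk from this diff):

    /* e.g. when a log commit for this inode finishes */
    spin_lock(&BTRFS_I(inode)->lock);
    BTRFS_I(inode)->logged_trans = trans->transid;
    BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
    spin_unlock(&BTRFS_I(inode)->lock);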
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index d897ef803b3b..ce7dec88f4b8 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
(unsigned long long)bio->bi_iter.bi_sector,
dev_bytenr, bio->bi_bdev);
- mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
- GFP_NOFS);
+ mapped_datav = kmalloc_array(bio->bi_vcnt,
+ sizeof(*mapped_datav), GFP_NOFS);
if (!mapped_datav)
goto leave;
cur_bytenr = dev_bytenr;
@@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root,
mutex_unlock(&btrfsic_mutex);
- if (is_vmalloc_addr(state))
- vfree(state);
- else
- kfree(state);
+ kvfree(state);
}
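Both conversions above move to safer helpers: kmalloc_array() returns NULL instead of silently overflowing n * size, and kvfree() frees a pointer regardless of whether it came from kmalloc() or vmalloc(). A minimal sketch of the combined pattern (names illustrative):

    void **mapped = kmalloc_array(nr, sizeof(*mapped), GFP_NOFS);

    if (!mapped)
            return -ENOMEM;
    /* ... use mapped ... */
    kvfree(mapped); /* safe for kmalloc()ed and vmalloc()ed pointers */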
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e9df8862012c..ce62324c78e7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->orig_bio = bio;
nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
- cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
+ cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
if (!cb->compressed_pages)
goto fail1;
@@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
-static struct btrfs_compress_op *btrfs_compress_op[] = {
+static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zlib_compress,
&btrfs_lzo_compress,
};
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d181f70caae0..13a4dc0436c9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -77,7 +77,7 @@ struct btrfs_compress_op {
size_t srclen, size_t destlen);
};
-extern struct btrfs_compress_op btrfs_zlib_compress;
-extern struct btrfs_compress_op btrfs_lzo_compress;
+extern const struct btrfs_compress_op btrfs_zlib_compress;
+extern const struct btrfs_compress_op btrfs_lzo_compress;
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d67f32e648d..0f11ebc92f02 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
if (!tree_mod_need_log(fs_info, eb))
return 0;
- tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
if (!tm_list)
return -ENOMEM;
@@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (log_removal && btrfs_header_level(old_root) > 0) {
nritems = btrfs_header_nritems(old_root);
- tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
flags);
if (!tm_list) {
ret = -ENOMEM;
@@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
return 0;
- tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
+ tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
GFP_NOFS);
if (!tm_list)
return -ENOMEM;
@@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
return 0;
nritems = btrfs_header_nritems(eb);
- tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
- GFP_NOFS);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
@@ -1073,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
}
- clean_tree_block(trans, root, buf);
+ clean_tree_block(trans, root->fs_info, buf);
*last_ref = 1;
}
return 0;
@@ -1678,7 +1677,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = btrfs_find_tree_block(root, blocknr);
+ cur = btrfs_find_tree_block(root->fs_info, blocknr);
if (cur)
uptodate = btrfs_buffer_uptodate(cur, gen, 0);
else
@@ -1943,7 +1942,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
path->nodes[level] = NULL;
- clean_tree_block(trans, root, mid);
+ clean_tree_block(trans, root->fs_info, mid);
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
@@ -1997,7 +1996,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) {
- clean_tree_block(trans, root, right);
+ clean_tree_block(trans, root->fs_info, right);
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
@@ -2041,7 +2040,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) {
- clean_tree_block(trans, root, mid);
+ clean_tree_block(trans, root->fs_info, mid);
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
@@ -2259,7 +2258,7 @@ static void reada_for_search(struct btrfs_root *root,
search = btrfs_node_blockptr(node, slot);
blocksize = root->nodesize;
- eb = btrfs_find_tree_block(root, search);
+ eb = btrfs_find_tree_block(root->fs_info, search);
if (eb) {
free_extent_buffer(eb);
return;
@@ -2319,7 +2318,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = btrfs_find_tree_block(root, block1);
+ eb = btrfs_find_tree_block(root->fs_info, block1);
/*
* if we get -eagain from btrfs_buffer_uptodate, we
* don't want to return eagain here. That will loop
@@ -2332,7 +2331,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot + 1 < nritems) {
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = btrfs_find_tree_block(root, block2);
+ eb = btrfs_find_tree_block(root->fs_info, block2);
if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block2 = 0;
free_extent_buffer(eb);
@@ -2450,7 +2449,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
- tmp = btrfs_find_tree_block(root, blocknr);
+ tmp = btrfs_find_tree_block(root->fs_info, blocknr);
if (tmp) {
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -3126,7 +3125,8 @@ again:
* higher levels
*
*/
-static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
struct btrfs_disk_key *key, int level)
{
int i;
@@ -3137,7 +3137,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
if (!path->nodes[i])
break;
t = path->nodes[i];
- tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
+ tree_mod_log_set_node_key(fs_info, t, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -3151,7 +3151,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
struct btrfs_key *new_key)
{
struct btrfs_disk_key disk_key;
@@ -3173,7 +3174,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
btrfs_set_item_key(eb, &disk_key, slot);
btrfs_mark_buffer_dirty(eb);
if (slot == 0)
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(fs_info, path, &disk_key, 1);
}
/*
@@ -3692,7 +3693,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (left_nritems)
btrfs_mark_buffer_dirty(left);
else
- clean_tree_block(trans, root, left);
+ clean_tree_block(trans, root->fs_info, left);
btrfs_mark_buffer_dirty(right);
@@ -3704,7 +3705,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (path->slots[0] >= left_nritems) {
path->slots[0] -= left_nritems;
if (btrfs_header_nritems(path->nodes[0]) == 0)
- clean_tree_block(trans, root, path->nodes[0]);
+ clean_tree_block(trans, root->fs_info, path->nodes[0]);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3928,10 +3929,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
if (right_nritems)
btrfs_mark_buffer_dirty(right);
else
- clean_tree_block(trans, root, right);
+ clean_tree_block(trans, root->fs_info, right);
btrfs_item_key(right, &disk_key, 0);
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -4168,6 +4169,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int mid;
int slot;
struct extent_buffer *right;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int wret;
int split;
@@ -4271,10 +4273,10 @@ again:
btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(right, root->root_key.objectid);
btrfs_set_header_level(right, 0);
- write_extent_buffer(right, root->fs_info->fsid,
+ write_extent_buffer(right, fs_info->fsid,
btrfs_header_fsid(), BTRFS_FSID_SIZE);
- write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+ write_extent_buffer(right, fs_info->chunk_tree_uuid,
btrfs_header_chunk_tree_uuid(right),
BTRFS_UUID_SIZE);
@@ -4297,7 +4299,7 @@ again:
path->nodes[0] = right;
path->slots[0] = 0;
if (path->slots[1] == 0)
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(fs_info, path, &disk_key, 1);
}
btrfs_mark_buffer_dirty(right);
return ret;
@@ -4615,7 +4617,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0)
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
}
item = btrfs_item_nr(slot);
@@ -4716,7 +4718,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -4888,7 +4890,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- fixup_low_keys(root, path, &disk_key, level + 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
}
@@ -4981,7 +4983,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_level(leaf, 0);
} else {
btrfs_set_path_blocking(path);
- clean_tree_block(trans, root, leaf);
+ clean_tree_block(trans, root->fs_info, leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
} else {
@@ -4990,7 +4992,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9c89cae39ee..6f364e1d8d3d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1061,6 +1061,12 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+#define BTRFS_QGROUP_LEVEL_SHIFT 48
+static inline u64 btrfs_qgroup_level(u64 qgroupid)
+{
+ return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+}
+
/*
* is subvolume quota turned on?
*/
@@ -1256,6 +1262,20 @@ struct btrfs_caching_control {
atomic_t count;
};
+struct btrfs_io_ctl {
+ void *cur, *orig;
+ struct page *page;
+ struct page **pages;
+ struct btrfs_root *root;
+ struct inode *inode;
+ unsigned long size;
+ int index;
+ int num_pages;
+ int entries;
+ int bitmaps;
+ unsigned check_crcs:1;
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -1321,6 +1341,9 @@ struct btrfs_block_group_cache {
/* For dirty block groups */
struct list_head dirty_list;
+ struct list_head io_list;
+
+ struct btrfs_io_ctl io_ctl;
};
/* delayed seq elem */
@@ -1329,6 +1352,8 @@ struct seq_list {
u64 seq;
};
+#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -1472,6 +1497,12 @@ struct btrfs_fs_info {
struct mutex chunk_mutex;
struct mutex volume_mutex;
+ /*
+ * this is taken to make sure we don't set block groups ro after
+ * the free space cache has been allocated on them
+ */
+ struct mutex ro_block_group_mutex;
+
/* this is used during read/modify/write to make sure
* no two ios are trying to mod the same stripe at the same
* time
@@ -1513,6 +1544,7 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
+ struct rw_semaphore delayed_iput_sem;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@@ -3295,6 +3327,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
}
/* extent-tree.c */
+
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
+
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
@@ -3385,6 +3420,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset, int no_quota);
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
@@ -3417,7 +3454,7 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3440,6 +3477,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
unsigned short type);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
@@ -3486,7 +3524,8 @@ int btrfs_previous_item(struct btrfs_root *root,
int type);
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
@@ -4180,7 +4219,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
- (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
+ !btrfs_qgroup_level(rootid)))
return 1;
return 0;
}
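A qgroupid packs the qgroup level into its top 16 bits and the subvolume/qgroup id into the low 48, which is why is_fstree() above now rejects rootids with a non-zero level. A worked example (values illustrative):

    u64 qgroupid = (1ULL << BTRFS_QGROUP_LEVEL_SHIFT) | 100; /* qgroup "1/100" */

    btrfs_qgroup_level(qgroupid); /* == 1 */
    btrfs_qgroup_level(256);      /* == 0, i.e. a plain subvolume id */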
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 82f0c7c95474..a2ae42720a6a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1383,7 +1383,7 @@ out:
static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
- struct btrfs_root *root, int nr)
+ struct btrfs_fs_info *fs_info, int nr)
{
struct btrfs_async_delayed_work *async_work;
@@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
btrfs_async_run_delayed_root, NULL, NULL);
async_work->nr = nr;
- btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
+ btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
return 0;
}
@@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
void btrfs_balance_delayed_items(struct btrfs_root *root)
{
struct btrfs_delayed_root *delayed_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
delayed_root = btrfs_get_delayed_root(root);
@@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
seq = atomic_read(&delayed_root->items_seq);
- ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
+ ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
if (ret)
return;
@@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
return;
}
- btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
+ btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
}
/* Will return 0 or -ENOMEM */
@@ -1801,6 +1802,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
+
inode->i_version = btrfs_stack_inode_sequence(inode_item);
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
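Restoring last_trans from the delayed item matters because btrfs_inode_in_log() (see the btrfs_inode.h hunk above) and the fsync path compare it against the current transaction; an inode refilled with last_trans left at zero could be misjudged on whether it still needs to be logged. The invariant the added line establishes can be sketched as:

    ASSERT(BTRFS_I(inode)->last_trans == btrfs_stack_inode_transid(inode_item));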
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6d16bea94e1c..8f8ed7d20bac 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -489,11 +489,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
* existing and update must have the same bytenr
*/
static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *existing,
struct btrfs_delayed_ref_node *update)
{
struct btrfs_delayed_ref_head *existing_ref;
struct btrfs_delayed_ref_head *ref;
+ int old_ref_mod;
existing_ref = btrfs_delayed_node_to_head(existing);
ref = btrfs_delayed_node_to_head(update);
@@ -541,7 +543,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* only need the lock for this case cause we could be processing it
* currently, for refs we just added we know we're a-ok.
*/
+ old_ref_mod = existing_ref->total_ref_mod;
existing->ref_mod += update->ref_mod;
+ existing_ref->total_ref_mod += update->ref_mod;
+
+ /*
+ * If we are going from a positive ref mod to a negative or vice
+ * versa, we need to make sure to adjust pending_csums accordingly.
+ */
+ if (existing_ref->is_data) {
+ if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0)
+ delayed_refs->pending_csums -= existing->num_bytes;
+ if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0)
+ delayed_refs->pending_csums += existing->num_bytes;
+ }
spin_unlock(&existing_ref->lock);
}
@@ -605,6 +620,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
head_ref->is_data = is_data;
head_ref->ref_root = RB_ROOT;
head_ref->processing = 0;
+ head_ref->total_ref_mod = count_mod;
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
@@ -614,7 +630,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- update_existing_head_ref(&existing->node, ref);
+ update_existing_head_ref(delayed_refs, &existing->node, ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
@@ -622,6 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ if (is_data && count_mod < 0)
+ delayed_refs->pending_csums += num_bytes;
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a764e2340d48..5eb0892396d0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -88,6 +88,14 @@ struct btrfs_delayed_ref_head {
struct rb_node href_node;
struct btrfs_delayed_extent_op *extent_op;
+
+ /*
+ * This is used to track the final ref_mod from all the refs associated
+ * with this head ref. It is not adjusted as delayed refs are run;
+ * it is meant to track whether we need to do the csum accounting or not.
+ */
+ int total_ref_mod;
+
/*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
@@ -138,6 +146,8 @@ struct btrfs_delayed_ref_root {
/* total number of head nodes ready for processing */
unsigned long num_heads_ready;
+ u64 pending_csums;
+
/*
* set when the tree is flushing before a transaction commit,
* used by the throttling code to decide if new updates need
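total_ref_mod lets the head-merging code and __btrfs_run_delayed_refs() keep delayed_refs->pending_csums consistent: that counter accumulates the bytes of data extents whose head currently nets out to a free, since freeing data implies csum tree deletions that must be reserved for. A worked micro-example of the sign-flip logic (sizes illustrative):

    /* head created with count_mod = -1 for a 1MiB data extent */
    pending_csums += SZ_1M;  /* total_ref_mod == -1 */

    /* a later +1 ref merges into the same head: -1 -> 0 */
    pending_csums -= SZ_1M;  /* no csum deletion pending anymore */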
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5ec03d999c37..0573848c7333 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -670,8 +670,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
srcdev = dev_replace->srcdev;
- args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
- div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
+ args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+ div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
btrfs_dev_replace_unlock(dev_replace);
@@ -806,7 +806,7 @@ static int btrfs_dev_replace_kthread(void *data)
btrfs_dev_replace_status(fs_info, status_args);
progress = status_args->status.progress_1000;
kfree(status_args);
- do_div(progress, 10);
+ progress = div_u64(progress, 10);
printk_in_rcu(KERN_INFO
"BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
dev_replace->srcdev->missing ? "<missing disk>" :
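div_u64() divides a u64 by a u32 divisor and is cheaper than the full div64_u64() on 32-bit architectures; it is valid here on the assumption that total_bytes / 1000 and the literal 10 fit in 32 bits. A minimal sketch of the per-mille computation:

    /* progress in per-mille: done / (total / 1000) */
    u64 per_mille = div_u64(done_bytes, div_u64(total_bytes, 1000));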
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 639f2663ed3f..2ef9a4b72d06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -54,7 +54,7 @@
#include <asm/cpufeature.h>
#endif
-static struct extent_io_ops btree_extent_io_ops;
+static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
@@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result)
* compute the csum for a btree block, and either verify it or write it
* into the csum field of the block.
*/
-static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+static int csum_tree_block(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *buf,
int verify)
{
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
char *result = NULL;
unsigned long len;
unsigned long cur_len;
@@ -302,7 +303,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
offset += cur_len;
}
if (csum_size > sizeof(inline_result)) {
- result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+ result = kzalloc(csum_size, GFP_NOFS);
if (!result)
return 1;
} else {
@@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
printk_ratelimited(KERN_WARNING
"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
"level %d\n",
- root->fs_info->sb->s_id, buf->start,
+ fs_info->sb->s_id, buf->start,
val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
kfree(result);
@@ -418,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
if (memcmp(raw_disk_sb, result, csum_size))
ret = 1;
-
- if (ret && btrfs_super_generation(disk_sb) < 10) {
- printk(KERN_WARNING
- "BTRFS: super block crcs don't match, older mkfs detected\n");
- ret = 0;
- }
}
if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
@@ -501,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
* we only fill in the checksum field in the first page of a multi-page block
*/
-static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
{
u64 start = page_offset(page);
u64 found_start;
@@ -513,14 +508,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
found_start = btrfs_header_bytenr(eb);
if (WARN_ON(found_start != start || !PageUptodate(page)))
return 0;
- csum_tree_block(root, eb, 0);
+ csum_tree_block(fs_info, eb, 0);
return 0;
}
-static int check_tree_block_fsid(struct btrfs_root *root,
+static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u8 fsid[BTRFS_UUID_SIZE];
int ret = 1;
@@ -640,7 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
ret = -EIO;
goto err;
}
- if (check_tree_block_fsid(root, eb)) {
+ if (check_tree_block_fsid(root->fs_info, eb)) {
printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
@@ -657,7 +652,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
eb, found_level);
- ret = csum_tree_block(root, eb, 1);
+ ret = csum_tree_block(root->fs_info, eb, 1);
if (ret) {
ret = -EIO;
goto err;
@@ -882,7 +877,7 @@ static int btree_csum_one_bio(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- ret = csum_dirty_buffer(root, bvec->bv_page);
+ ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
break;
}
@@ -1119,10 +1114,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
return 0;
}
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr)
{
- return find_extent_buffer(root->fs_info, bytenr);
+ return find_extent_buffer(fs_info, bytenr);
}
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -1165,11 +1160,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
}
-void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+void clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
struct extent_buffer *buf)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
-
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
@@ -2146,6 +2140,267 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
}
}
+static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
+{
+ mutex_init(&fs_info->scrub_lock);
+ atomic_set(&fs_info->scrubs_running, 0);
+ atomic_set(&fs_info->scrub_pause_req, 0);
+ atomic_set(&fs_info->scrubs_paused, 0);
+ atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->scrub_pause_wait);
+ fs_info->scrub_workers_refcnt = 0;
+}
+
+static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
+}
+
+static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *tree_root)
+{
+ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ set_nlink(fs_info->btree_inode, 1);
+ /*
+ * we set the i_size on the btree inode to the max possible int.
+ * the real end of the address space is determined by all of
+ * the devices in the system
+ */
+ fs_info->btree_inode->i_size = OFFSET_MAX;
+ fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+
+ RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
+ extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
+ fs_info->btree_inode->i_mapping);
+ BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
+ extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+
+ BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+
+ BTRFS_I(fs_info->btree_inode)->root = tree_root;
+ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+ sizeof(struct btrfs_key));
+ set_bit(BTRFS_INODE_DUMMY,
+ &BTRFS_I(fs_info->btree_inode)->runtime_flags);
+ btrfs_insert_inode_hash(fs_info->btree_inode);
+}
+
+static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
+{
+ fs_info->dev_replace.lock_owner = 0;
+ atomic_set(&fs_info->dev_replace.nesting_level, 0);
+ mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+ mutex_init(&fs_info->dev_replace.lock_management_lock);
+ mutex_init(&fs_info->dev_replace.lock);
+ init_waitqueue_head(&fs_info->replace_wait);
+}
+
+static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->qgroup_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_op_tree = RB_ROOT;
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ fs_info->qgroup_seq = 1;
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ fs_info->qgroup_ulist = NULL;
+ mutex_init(&fs_info->qgroup_rescan_lock);
+}
+
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int max_active = fs_info->thread_pool_size;
+ unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+
+ fs_info->workers =
+ btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+ max_active, 16);
+
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue("cache", flags, max_active, 0);
+
+ /*
+ * a higher idle thresh on the submit workers makes it much more
+ * likely that bios will be sent down in a sane order to the
+ * devices
+ */
+ fs_info->submit_workers =
+ btrfs_alloc_workqueue("submit", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 64);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue("fixup", flags, 1, 0);
+
+ /*
+ * endios are largely parallel and should have a very
+ * low idle thresh
+ */
+ fs_info->endio_workers =
+ btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ fs_info->endio_meta_workers =
+ btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ fs_info->endio_meta_write_workers =
+ btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ fs_info->endio_raid56_workers =
+ btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->endio_repair_workers =
+ btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
+ fs_info->rmw_workers =
+ btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ fs_info->readahead_workers =
+ btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ fs_info->extent_workers =
+ btrfs_alloc_workqueue("extent-refs", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 8);
+
+ if (!(fs_info->workers && fs_info->delalloc_workers &&
+ fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->endio_meta_write_workers &&
+ fs_info->endio_repair_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->readahead_workers &&
+ fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->extent_workers &&
+ fs_info->qgroup_rescan_workers)) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
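Every queue above goes through btrfs_alloc_workqueue(name, flags, max_active, thresh), where thresh (as assumed here) is the queued-work threshold used to scale the number of active workers, with 0 selecting the default. An illustrative call:

    struct btrfs_workqueue *wq;

    wq = btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, max_active, 16);
    if (!wq)
            return -ENOMEM;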
+static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *log_tree_root;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 bytenr = btrfs_super_log_root(disk_super);
+
+ if (fs_devices->rw_devices == 0) {
+ printk(KERN_WARNING "BTRFS: log replay required "
+ "on RO media\n");
+ return -EIO;
+ }
+
+ log_tree_root = btrfs_alloc_root(fs_info);
+ if (!log_tree_root)
+ return -ENOMEM;
+
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, log_tree_root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
+
+ log_tree_root->node = read_tree_block(tree_root, bytenr,
+ fs_info->generation + 1);
+ if (!log_tree_root->node ||
+ !extent_buffer_uptodate(log_tree_root->node)) {
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ return -EIO;
+ }
+ /* returns with log_tree_root freed on success */
+ ret = btrfs_recover_log_trees(log_tree_root);
+ if (ret) {
+ btrfs_error(tree_root->fs_info, ret,
+ "Failed to recover log tree");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ return ret;
+ }
+
+ if (fs_info->sb->s_flags & MS_RDONLY) {
+ ret = btrfs_commit_super(tree_root);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *tree_root)
+{
+ struct btrfs_root *root;
+ struct btrfs_key location;
+ int ret;
+
+ location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
+
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->extent_root = root;
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->dev_root = root;
+ btrfs_init_devices_late(fs_info);
+
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->csum_root = root;
+
+ location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (!IS_ERR(root)) {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->quota_enabled = 1;
+ fs_info->pending_quota_state = 1;
+ fs_info->quota_root = root;
+ }
+
+ location.objectid = BTRFS_UUID_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT)
+ return ret;
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->uuid_root = root;
+ }
+
+ return 0;
+}
+
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
@@ -2160,21 +2415,12 @@ int open_ctree(struct super_block *sb,
struct btrfs_super_block *disk_super;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *tree_root;
- struct btrfs_root *extent_root;
- struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
- struct btrfs_root *dev_root;
- struct btrfs_root *quota_root;
- struct btrfs_root *uuid_root;
- struct btrfs_root *log_tree_root;
int ret;
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
int max_active;
- int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
- bool create_uuid_tree;
- bool check_uuid_tree;
tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
@@ -2241,11 +2487,12 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
- mutex_init(&fs_info->unused_bg_unpin_mutex);
rwlock_init(&fs_info->tree_mod_log_lock);
+ mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
seqlock_init(&fs_info->profiles_lock);
+ init_rwsem(&fs_info->delayed_iput_sem);
init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2276,7 +2523,7 @@ int open_ctree(struct super_block *sb,
fs_info->free_chunk_space = 0;
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
- fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
+ fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
spin_lock_init(&fs_info->reada_lock);
@@ -2294,55 +2541,18 @@ int open_ctree(struct super_block *sb,
}
btrfs_init_delayed_root(fs_info->delayed_root);
- mutex_init(&fs_info->scrub_lock);
- atomic_set(&fs_info->scrubs_running, 0);
- atomic_set(&fs_info->scrub_pause_req, 0);
- atomic_set(&fs_info->scrubs_paused, 0);
- atomic_set(&fs_info->scrub_cancel_req, 0);
- init_waitqueue_head(&fs_info->replace_wait);
- init_waitqueue_head(&fs_info->scrub_pause_wait);
- fs_info->scrub_workers_refcnt = 0;
+ btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
fs_info->check_integrity_print_mask = 0;
#endif
-
- spin_lock_init(&fs_info->balance_lock);
- mutex_init(&fs_info->balance_mutex);
- atomic_set(&fs_info->balance_running, 0);
- atomic_set(&fs_info->balance_pause_req, 0);
- atomic_set(&fs_info->balance_cancel_req, 0);
- fs_info->balance_ctl = NULL;
- init_waitqueue_head(&fs_info->balance_wait_q);
+ btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
- fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- set_nlink(fs_info->btree_inode, 1);
- /*
- * we set the i_size on the btree inode to the max possible int.
- * the real end of the address space is determined by all of
- * the devices in the system
- */
- fs_info->btree_inode->i_size = OFFSET_MAX;
- fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-
- RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
- extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
- fs_info->btree_inode->i_mapping);
- BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
- extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
-
- BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
-
- BTRFS_I(fs_info->btree_inode)->root = tree_root;
- memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
- sizeof(struct btrfs_key));
- set_bit(BTRFS_INODE_DUMMY,
- &BTRFS_I(fs_info->btree_inode)->runtime_flags);
- btrfs_insert_inode_hash(fs_info->btree_inode);
+ btrfs_init_btree_inode(fs_info, tree_root);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2363,26 +2573,14 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
+ mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
- fs_info->dev_replace.lock_owner = 0;
- atomic_set(&fs_info->dev_replace.nesting_level, 0);
- mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
- mutex_init(&fs_info->dev_replace.lock_management_lock);
- mutex_init(&fs_info->dev_replace.lock);
- spin_lock_init(&fs_info->qgroup_lock);
- mutex_init(&fs_info->qgroup_ioctl_lock);
- fs_info->qgroup_tree = RB_ROOT;
- fs_info->qgroup_op_tree = RB_ROOT;
- INIT_LIST_HEAD(&fs_info->dirty_qgroups);
- fs_info->qgroup_seq = 1;
- fs_info->quota_enabled = 0;
- fs_info->pending_quota_state = 0;
- fs_info->qgroup_ulist = NULL;
- mutex_init(&fs_info->qgroup_rescan_lock);
+ btrfs_init_dev_replace_locks(fs_info);
+ btrfs_init_qgroup(fs_info);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2554,75 +2752,9 @@ int open_ctree(struct super_block *sb,
max_active = fs_info->thread_pool_size;
- fs_info->workers =
- btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
- max_active, 16);
-
- fs_info->delalloc_workers =
- btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
-
- fs_info->flush_workers =
- btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
-
- fs_info->caching_workers =
- btrfs_alloc_workqueue("cache", flags, max_active, 0);
-
- /*
- * a higher idle thresh on the submit workers makes it much more
- * likely that bios will be send down in a sane order to the
- * devices
- */
- fs_info->submit_workers =
- btrfs_alloc_workqueue("submit", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 64);
-
- fs_info->fixup_workers =
- btrfs_alloc_workqueue("fixup", flags, 1, 0);
-
- /*
- * endios are largely parallel and should have a very
- * low idle thresh
- */
- fs_info->endio_workers =
- btrfs_alloc_workqueue("endio", flags, max_active, 4);
- fs_info->endio_meta_workers =
- btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
- fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
- fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
- fs_info->endio_repair_workers =
- btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
- fs_info->rmw_workers =
- btrfs_alloc_workqueue("rmw", flags, max_active, 2);
- fs_info->endio_write_workers =
- btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
- fs_info->endio_freespace_worker =
- btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
- fs_info->delayed_workers =
- btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
- fs_info->readahead_workers =
- btrfs_alloc_workqueue("readahead", flags, max_active, 2);
- fs_info->qgroup_rescan_workers =
- btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
- fs_info->extent_workers =
- btrfs_alloc_workqueue("extent-refs", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 8);
-
- if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->submit_workers && fs_info->flush_workers &&
- fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->endio_meta_write_workers &&
- fs_info->endio_repair_workers &&
- fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
- fs_info->endio_freespace_worker && fs_info->rmw_workers &&
- fs_info->caching_workers && fs_info->readahead_workers &&
- fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->extent_workers &&
- fs_info->qgroup_rescan_workers)) {
- err = -ENOMEM;
+ ret = btrfs_init_workqueues(fs_info, fs_devices);
+ if (ret) {
+ err = ret;
goto fail_sb_buffer;
}
@@ -2688,7 +2820,7 @@ int open_ctree(struct super_block *sb,
* keep the device that is marked to be the target device for the
* dev_replace procedure
*/
- btrfs_close_extra_devices(fs_info, fs_devices, 0);
+ btrfs_close_extra_devices(fs_devices, 0);
if (!fs_devices->latest_bdev) {
printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
@@ -2714,61 +2846,9 @@ retry_root_backup:
tree_root->commit_root = btrfs_root_node(tree_root);
btrfs_set_root_refs(&tree_root->root_item, 1);
- location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = 0;
-
- extent_root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(extent_root)) {
- ret = PTR_ERR(extent_root);
- goto recovery_tree_root;
- }
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
- fs_info->extent_root = extent_root;
-
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
- dev_root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(dev_root)) {
- ret = PTR_ERR(dev_root);
- goto recovery_tree_root;
- }
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
- fs_info->dev_root = dev_root;
- btrfs_init_devices_late(fs_info);
-
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- csum_root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(csum_root)) {
- ret = PTR_ERR(csum_root);
+ ret = btrfs_read_roots(fs_info, tree_root);
+ if (ret)
goto recovery_tree_root;
- }
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
- fs_info->csum_root = csum_root;
-
- location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
- quota_root = btrfs_read_tree_root(tree_root, &location);
- if (!IS_ERR(quota_root)) {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
- fs_info->quota_enabled = 1;
- fs_info->pending_quota_state = 1;
- fs_info->quota_root = quota_root;
- }
-
- location.objectid = BTRFS_UUID_TREE_OBJECTID;
- uuid_root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(uuid_root)) {
- ret = PTR_ERR(uuid_root);
- if (ret != -ENOENT)
- goto recovery_tree_root;
- create_uuid_tree = true;
- check_uuid_tree = false;
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
- fs_info->uuid_root = uuid_root;
- create_uuid_tree = false;
- check_uuid_tree =
- generation != btrfs_super_uuid_tree_generation(disk_super);
- }
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
@@ -2792,7 +2872,7 @@ retry_root_backup:
goto fail_block_groups;
}
- btrfs_close_extra_devices(fs_info, fs_devices, 1);
+ btrfs_close_extra_devices(fs_devices, 1);
ret = btrfs_sysfs_add_one(fs_info);
if (ret) {
@@ -2806,7 +2886,7 @@ retry_root_backup:
goto fail_sysfs;
}
- ret = btrfs_read_block_groups(extent_root);
+ ret = btrfs_read_block_groups(fs_info->extent_root);
if (ret) {
printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
goto fail_sysfs;
@@ -2864,48 +2944,11 @@ retry_root_backup:
/* do not make disk changes in broken FS */
if (btrfs_super_log_root(disk_super) != 0) {
- u64 bytenr = btrfs_super_log_root(disk_super);
-
- if (fs_devices->rw_devices == 0) {
- printk(KERN_WARNING "BTRFS: log replay required "
- "on RO media\n");
- err = -EIO;
- goto fail_qgroup;
- }
-
- log_tree_root = btrfs_alloc_root(fs_info);
- if (!log_tree_root) {
- err = -ENOMEM;
- goto fail_qgroup;
- }
-
- __setup_root(nodesize, sectorsize, stripesize,
- log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
- log_tree_root->node = read_tree_block(tree_root, bytenr,
- generation + 1);
- if (!log_tree_root->node ||
- !extent_buffer_uptodate(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
- goto fail_qgroup;
- }
- /* returns with log_tree_root freed on success */
- ret = btrfs_recover_log_trees(log_tree_root);
+ ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
- btrfs_error(tree_root->fs_info, ret,
- "Failed to recover log tree");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
+ err = ret;
goto fail_qgroup;
}
-
- if (sb->s_flags & MS_RDONLY) {
- ret = btrfs_commit_super(tree_root);
- if (ret)
- goto fail_qgroup;
- }
}
ret = btrfs_find_orphan_roots(tree_root);
@@ -2966,7 +3009,7 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
- if (create_uuid_tree) {
+ if (!fs_info->uuid_root) {
pr_info("BTRFS: creating UUID tree\n");
ret = btrfs_create_uuid_tree(fs_info);
if (ret) {
@@ -2975,8 +3018,9 @@ retry_root_backup:
close_ctree(tree_root);
return ret;
}
- } else if (check_uuid_tree ||
- btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
+ } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
+ fs_info->generation !=
+ btrfs_super_uuid_tree_generation(disk_super)) {
pr_info("BTRFS: checking UUID tree\n");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
@@ -3668,7 +3712,7 @@ void close_ctree(struct btrfs_root *root)
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
- btrfs_err(root->fs_info, "commit super ret %d", ret);
+ btrfs_err(fs_info, "commit super ret %d", ret);
}
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
@@ -3680,10 +3724,10 @@ void close_ctree(struct btrfs_root *root)
fs_info->closing = 2;
smp_mb();
- btrfs_free_qgroup_config(root->fs_info);
+ btrfs_free_qgroup_config(fs_info);
if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
- btrfs_info(root->fs_info, "at unmount delalloc count %lld",
+ btrfs_info(fs_info, "at unmount delalloc count %lld",
percpu_counter_sum(&fs_info->delalloc_bytes));
}
@@ -3723,7 +3767,7 @@ void close_ctree(struct btrfs_root *root)
btrfs_free_stripe_hash_table(fs_info);
- btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ __btrfs_free_block_rsv(root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
lock_chunks(root);
@@ -4134,7 +4178,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
- eb = btrfs_find_tree_block(root, start);
+ eb = btrfs_find_tree_block(root->fs_info, start);
start += root->nodesize;
if (!eb)
continue;
@@ -4285,7 +4329,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
return 0;
}
-static struct extent_io_ops btree_extent_io_ops = {
+static const struct extent_io_ops btree_extent_io_ops = {
.readpage_end_io_hook = btree_readpage_end_io_hook,
.readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 27d44c0fd236..d4cbfeeeedd4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr);
void clean_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf);
+ struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
@@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr);
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 37d164540c3a..8d052209f473 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -152,7 +152,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
static struct dentry *btrfs_get_parent(struct dentry *child)
{
- struct inode *dir = child->d_inode;
+ struct inode *dir = d_inode(child);
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -220,8 +220,8 @@ fail:
static int btrfs_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
- struct inode *inode = child->d_inode;
- struct inode *dir = parent->d_inode;
+ struct inode *inode = d_inode(child);
+ struct inode *dir = d_inode(parent);
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_inode_ref *iref;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8b353ad02f03..7effed6f2fa6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2538,6 +2538,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* list before we release it.
*/
if (btrfs_delayed_ref_is_head(ref)) {
+ if (locked_ref->is_data &&
+ locked_ref->total_ref_mod < 0) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= ref->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ }
btrfs_delayed_ref_unlock(locked_ref);
locked_ref = NULL;
}
@@ -2561,8 +2567,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
*/
spin_lock(&delayed_refs->lock);
avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
- avg = div64_u64(avg, 4);
- fs_info->avg_delayed_ref_runtime = avg;
+ fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
spin_unlock(&delayed_refs->lock);
}
return 0;
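
The replacement above is the same arithmetic written as a shift: the running
average of delayed-ref runtimes is a fixed-point exponential moving average,
three parts history to one part new sample. A standalone sketch of the update
(names and main() are illustrative, not btrfs API):

    #include <stdio.h>
    #include <stdint.h>

    /* avg <- (3 * avg + sample) / 4, the same fixed-point EWMA as above */
    static uint64_t ewma4(uint64_t avg, uint64_t sample)
    {
        return (avg * 3 + sample) >> 2; /* div by 4 */
    }

    int main(void)
    {
        uint64_t avg = 0;
        uint64_t samples[] = { 1000, 1200, 800, 4000 };

        for (int i = 0; i < 4; i++) {
            avg = ewma4(avg, samples[i]);
            printf("sample %llu -> avg %llu\n",
                   (unsigned long long)samples[i],
                   (unsigned long long)avg);
        }
        return 0;
    }
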
@@ -2624,7 +2629,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
* We don't ever fill up leaves all the way so multiply by 2 just to be
 * closer to what we're really going to want to use.
*/
- return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+/*
+ * Takes the number of bytes to be checksummed and figures out how many leaves it
+ * would require to store the csums for that many bytes.
+ */
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+{
+ u64 csum_size;
+ u64 num_csums_per_leaf;
+ u64 num_csums;
+
+ csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ num_csums_per_leaf = div64_u64(csum_size,
+ (u64)btrfs_super_csum_size(root->fs_info->super_copy));
+ num_csums = div64_u64(csum_bytes, root->sectorsize);
+ num_csums += num_csums_per_leaf - 1;
+ num_csums = div64_u64(num_csums, num_csums_per_leaf);
+ return num_csums;
}
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2632,7 +2656,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
{
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
- u64 num_bytes;
+ u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
+ u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+ u64 num_bytes, num_dirty_bgs_bytes;
int ret = 0;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
@@ -2640,17 +2666,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
if (num_heads > 1)
num_bytes += (num_heads - 1) * root->nodesize;
num_bytes <<= 1;
+ num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+ num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+ num_dirty_bgs);
global_rsv = &root->fs_info->global_block_rsv;
/*
 * If we can't allocate any more chunks let's make sure we have _lots_ of
* wiggle room since running delayed refs can create more delayed refs.
*/
- if (global_rsv->space_info->full)
+ if (global_rsv->space_info->full) {
+ num_dirty_bgs_bytes <<= 1;
num_bytes <<= 1;
+ }
spin_lock(&global_rsv->lock);
- if (global_rsv->reserved <= num_bytes)
+ if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
ret = 1;
spin_unlock(&global_rsv->lock);
return ret;
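
The shape of the updated estimate, with the new csum-leaf and dirty-block-group
terms broken out. Here metadata_unit stands in for
btrfs_calc_trans_metadata_size(root, 1), whose exact value depends on tree
height, so all the numbers below are illustrative only:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t delayed_ref_reserve(uint64_t num_heads, uint64_t csum_leaves,
                                        uint64_t num_dirty_bgs,
                                        uint64_t metadata_unit, uint64_t nodesize,
                                        int space_full)
    {
        uint64_t num_bytes = metadata_unit;
        uint64_t bg_bytes = metadata_unit * num_dirty_bgs;

        if (num_heads > 1)
            num_bytes += (num_heads - 1) * nodesize;
        num_bytes <<= 1;
        num_bytes += csum_leaves * nodesize;
        if (space_full) {           /* extra wiggle room when full */
            num_bytes <<= 1;
            bg_bytes <<= 1;
        }
        return num_bytes + bg_bytes;
    }

    int main(void)
    {
        /* 8 heads, 65 csum leaves, 2 dirty bgs, 16K nodes, not full */
        printf("%llu bytes\n", (unsigned long long)
               delayed_ref_reserve(8, 65, 2, 16384 * 3, 16384, 0));
        return 0;
    }
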
@@ -3147,10 +3178,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
fail:
- if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_release_path(path);
return ret;
}
@@ -3193,7 +3222,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
struct inode *inode = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
- int num_pages = 0;
+ u64 num_pages = 0;
int retries = 0;
int ret = 0;
@@ -3267,15 +3296,14 @@ again:
if (ret)
goto out_put;
- ret = btrfs_truncate_free_space_cache(root, trans, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
if (ret)
goto out_put;
}
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE) ||
- block_group->delalloc_bytes) {
+ !btrfs_test_opt(root, SPACE_CACHE)) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -3293,14 +3321,14 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
+ num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
if (!num_pages)
num_pages = 1;
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, num_pages);
+ ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
if (ret)
goto out_put;
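
The heuristic above sizes the cache file at 16 pages per 256 MiB of block
group. Worked through for a 1 GiB block group with 4 KiB pages (the usual
PAGE_CACHE_SIZE), that is 4 * 16 = 64 pages, a 256 KiB data reservation:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t bg_size = 1ULL << 30;          /* 1 GiB block group */
        uint64_t page_size = 4096;              /* assumed PAGE_CACHE_SIZE */
        uint64_t num_pages = bg_size / (256ULL * 1024 * 1024);

        if (!num_pages)
            num_pages = 1;
        num_pages *= 16;
        printf("reserve %llu bytes for the space cache\n",
               (unsigned long long)(num_pages * page_size));
        return 0;
    }
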
@@ -3351,16 +3379,188 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS. This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO. There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ int should_put;
+ struct btrfs_path *path = NULL;
+ LIST_HEAD(dirty);
+ struct list_head *io = &cur_trans->io_bgs;
+ int num_started = 0;
+ int loops = 0;
+
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cur_trans->dirty_bgs)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ return 0;
+ }
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+ /*
+ * make sure all the block groups on our dirty list actually
+ * exist
+ */
+ btrfs_create_pending_block_groups(trans, root);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ /*
+ * cache_write_mutex is here only to save us from balance or automatic
+ * removal of empty block groups deleting this block group while we are
+ * writing out the cache
+ */
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ while (!list_empty(&dirty)) {
+ cache = list_first_entry(&dirty,
+ struct btrfs_block_group_cache,
+ dirty_list);
+ /*
+ * this can happen if something re-dirties a block
+ * group that is already under IO. Just wait for it to
+ * finish and then do it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path,
+ cache->key.objectid);
+ btrfs_put_block_group(cache);
+ }
+
+ /*
+ * btrfs_wait_cache_io uses the cache->dirty_list to decide
+ * if it should update the cache_state. Don't delete
+ * until after we wait.
+ *
+ * Since we're not running in the commit critical section
+ * we need the dirty_bgs_lock to protect from update_block_group
+ */
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
+ if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(root, trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ num_started++;
+ should_put = 0;
+
+ /*
+ * the cache_write_mutex is protecting
+ * the io_list
+ */
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * if we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ ret = write_one_cache_group(trans, root, path, cache);
+ /*
+ * Our block group might still be attached to the list
+ * of new block groups in the transaction handle of some
+ * other task (struct btrfs_trans_handle->new_bgs). This
+ * means its block group item isn't yet in the extent
+ * tree. If this happens ignore the error, as we will
+ * try again later in the critical section of the
+ * transaction commit.
+ */
+ if (ret == -ENOENT) {
+ ret = 0;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &cur_trans->dirty_bgs);
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ } else if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ }
+ }
+
+ /* if it's not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+
+ if (ret)
+ break;
+
+ /*
+ * Avoid blocking other tasks for too long. It might even save
+ * us from writing caches for block groups that are going to be
+ * removed.
+ */
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ }
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ /*
+ * go through delayed refs for all the stuff we've just kicked off
+ * and then loop back (just once)
+ */
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ if (!ret && loops == 0) {
+ loops++;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ /*
+ * dirty_bgs_lock protects us from concurrent block group
+ * deletes too (not just cache_write_mutex).
+ */
+ if (!list_empty(&dirty)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ goto again;
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
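
btrfs_start_dirty_block_groups() follows a familiar pattern: splice the shared
dirty list to a private one under the lock, process the private list with the
lock dropped, then loop back exactly once for anything re-dirtied in the
meantime. A user-space sketch of that pattern with a toy singly linked list
(no btrfs types):

    #include <stdio.h>
    #include <stdlib.h>
    #include <pthread.h>

    struct node { struct node *next; int id; };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *dirty;      /* shared list, protected by lock */

    static void process_dirty(void)
    {
        struct node *mine;
        int loops = 0;
    again:
        /* steal the whole list while holding the lock */
        pthread_mutex_lock(&lock);
        mine = dirty;
        dirty = NULL;
        pthread_mutex_unlock(&lock);

        /* work through our private copy with the lock dropped */
        while (mine) {
            struct node *n = mine;

            mine = n->next;
            printf("writing block group %d\n", n->id);
            free(n);
        }

        /* loop back once for anything re-dirtied while we worked */
        if (loops++ == 0) {
            pthread_mutex_lock(&lock);
            if (dirty) {
                pthread_mutex_unlock(&lock);
                goto again;
            }
            pthread_mutex_unlock(&lock);
        }
    }

    int main(void)
    {
        for (int i = 0; i < 3; i++) {
            struct node *n = malloc(sizeof(*n));

            if (!n)
                break;
            n->id = i;
            n->next = dirty;
            dirty = n;
        }
        process_dirty();
        return 0;
    }
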
+
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
+ int should_put;
struct btrfs_path *path;
-
- if (list_empty(&cur_trans->dirty_bgs))
- return 0;
+ struct list_head *io = &cur_trans->io_bgs;
+ int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3376,16 +3576,64 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
dirty_list);
+
+ /*
+ * this can happen if cache_save_setup re-dirties a block
+ * group that is already under IO. Just wait for it to
+ * finish and then do it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path,
+ cache->key.objectid);
+ btrfs_put_block_group(cache);
+ }
+
+ /*
+ * don't remove from the dirty list until after we've waited
+ * on any pending IO
+ */
list_del_init(&cache->dirty_list);
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
- cache_save_setup(cache, trans, path);
- if (!ret)
- ret = btrfs_run_delayed_refs(trans, root,
- (unsigned long) -1);
- if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
- btrfs_write_out_cache(root, trans, cache, path);
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
if (!ret)
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+ if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(root, trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ num_started++;
+ should_put = 0;
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * if we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
ret = write_one_cache_group(trans, root, path, cache);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ }
+
+ /* if it's not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+ }
+
+ while (!list_empty(io)) {
+ cache = list_first_entry(io, struct btrfs_block_group_cache,
+ io_list);
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path, cache->key.objectid);
btrfs_put_block_group(cache);
}
@@ -3635,19 +3883,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
* This will check the space that the inode allocates from to make sure we have
* enough space for bytes.
*/
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 used;
- int ret = 0, committed = 0, alloc_chunk = 1;
+ int ret = 0;
+ int need_commit = 2;
+ int have_pinned_space;
/* make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, root->sectorsize);
if (btrfs_is_free_space_inode(inode)) {
- committed = 1;
+ need_commit = 0;
ASSERT(current->journal_info);
}
@@ -3669,7 +3919,7 @@ again:
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
*/
- if (!data_sinfo->full && alloc_chunk) {
+ if (!data_sinfo->full) {
u64 alloc_target;
data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -3697,8 +3947,10 @@ alloc:
if (ret < 0) {
if (ret != -ENOSPC)
return ret;
- else
+ else {
+ have_pinned_space = 1;
goto commit_trans;
+ }
}
if (!data_sinfo)
@@ -3709,26 +3961,39 @@ alloc:
/*
* If we don't have enough pinned space to deal with this
- * allocation don't bother committing the transaction.
+ * allocation, and no chunk was removed in the current transaction,
+ * don't bother committing the transaction.
*/
- if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
- bytes) < 0)
- committed = 1;
+ have_pinned_space = percpu_counter_compare(
+ &data_sinfo->total_bytes_pinned,
+ used + bytes - data_sinfo->total_bytes);
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
commit_trans:
- if (!committed &&
+ if (need_commit &&
!atomic_read(&root->fs_info->open_ioctl_trans)) {
- committed = 1;
+ need_commit--;
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- goto again;
+ if (have_pinned_space >= 0 ||
+ trans->transaction->have_free_bgs ||
+ need_commit > 0) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ /*
+ * make sure that all running delayed iputs are
+ * done
+ */
+ down_write(&root->fs_info->delayed_iput_sem);
+ up_write(&root->fs_info->delayed_iput_sem);
+ goto again;
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
}
trace_btrfs_space_reservation(root->fs_info,
@@ -3736,12 +4001,16 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
+ ret = btrfs_qgroup_reserve(root, write_bytes);
+ if (ret)
+ goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
+out:
spin_unlock(&data_sinfo->lock);
- return 0;
+ return ret;
}
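
need_commit starting at 2 means the ENOSPC path may fall back to committing
the running transaction at most twice before returning -ENOSPC, and never for
the free space inode, which starts it at 0. Stripped of the btrfs plumbing,
the control flow looks like this (the checks are stand-ins, illustrative
only):

    #include <stdio.h>

    static int have_enough_space(int attempt) { return attempt >= 2; }
    static void commit_transaction(void) { printf("commit\n"); }

    static int reserve_data_space(int is_free_space_inode)
    {
        int need_commit = is_free_space_inode ? 0 : 2;
        int attempt = 0;

    again:
        if (have_enough_space(attempt))
            return 0;
        if (need_commit) {
            need_commit--;
            commit_transaction();   /* may free pinned bytes */
            attempt++;
            goto again;
        }
        return -1;                  /* ENOSPC */
    }

    int main(void)
    {
        printf("ret %d\n", reserve_data_space(0));
        return 0;
    }
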
/*
@@ -4298,8 +4567,13 @@ out:
static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
struct btrfs_fs_info *fs_info, u64 used)
{
- return (used >= div_factor_fine(space_info->total_bytes, 98) &&
- !btrfs_fs_closing(fs_info) &&
+ u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+ /* If we're just plain full then async reclaim only slows us down. */
+ if (space_info->bytes_used >= thresh)
+ return 0;
+
+ return (used >= thresh && !btrfs_fs_closing(fs_info) &&
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
@@ -4354,10 +4628,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
if (!btrfs_need_do_async_reclaim(space_info, fs_info,
flush_state))
return;
- } while (flush_state <= COMMIT_TRANS);
-
- if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
- queue_work(system_unbound_wq, work);
+ } while (flush_state < COMMIT_TRANS);
}
void btrfs_init_async_reclaim_work(struct work_struct *work)
@@ -4700,6 +4971,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
kfree(rsv);
}
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
+{
+ kfree(rsv);
+}
+
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush)
@@ -4812,10 +5088,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
csum_size * 2;
- num_bytes += div64_u64(data_used + meta_used, 50);
+ num_bytes += div_u64(data_used + meta_used, 50);
if (num_bytes * 3 > meta_used)
- num_bytes = div64_u64(meta_used, 3);
+ num_bytes = div_u64(meta_used, 3);
return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
}
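
Worked through with illustrative numbers (100 GiB of data, 1 GiB of metadata,
4 KiB blocks, 4-byte crc32c csums): the csum term is (100 GiB >> 12) * 4 * 2 =
200 MiB, the 2% term adds about 2 GiB more, and the result is then capped at a
third of the metadata in use:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t data_used = 100ULL << 30;
        uint64_t meta_used = 1ULL << 30;
        uint64_t csum_size = 4;

        uint64_t num_bytes = (data_used >> 12) * csum_size * 2;
        num_bytes += (data_used + meta_used) / 50;  /* ~2% */
        if (num_bytes * 3 > meta_used)
            num_bytes = meta_used / 3;              /* cap at 1/3 of meta */
        printf("global rsv ~ %llu MiB\n",
               (unsigned long long)(num_bytes >> 20));
        return 0;
    }
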
@@ -4998,8 +5274,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
u64 qgroup_reserved)
{
btrfs_block_rsv_release(root, rsv, (u64)-1);
- if (qgroup_reserved)
- btrfs_qgroup_free(root, qgroup_reserved);
}
/**
@@ -5066,30 +5340,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
int reserve)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 csum_size;
- int num_csums_per_leaf;
- int num_csums;
- int old_csums;
+ u64 old_csums, num_csums;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
BTRFS_I(inode)->csum_bytes == 0)
return 0;
- old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+ old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
if (reserve)
BTRFS_I(inode)->csum_bytes += num_bytes;
else
BTRFS_I(inode)->csum_bytes -= num_bytes;
- csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
- num_csums_per_leaf = (int)div64_u64(csum_size,
- sizeof(struct btrfs_csum_item) +
- sizeof(struct btrfs_disk_key));
- num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
- num_csums = num_csums + num_csums_per_leaf - 1;
- num_csums = num_csums / num_csums_per_leaf;
-
- old_csums = old_csums + num_csums_per_leaf - 1;
- old_csums = old_csums / num_csums_per_leaf;
+ num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
/* No change, no need to reserve more */
if (old_csums == num_csums)
@@ -5163,8 +5425,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_unlock(&BTRFS_I(inode)->lock);
if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, num_bytes +
- nr_extents * root->nodesize);
+ ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
@@ -5172,8 +5433,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, num_bytes +
- nr_extents * root->nodesize);
+ btrfs_qgroup_free(root, nr_extents * root->nodesize);
goto out_fail;
}
@@ -5290,10 +5550,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_free, 0);
- if (root->fs_info->quota_enabled) {
- btrfs_qgroup_free(root, num_bytes +
- dropped * root->nodesize);
- }
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
@@ -5318,7 +5574,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
{
int ret;
- ret = btrfs_check_data_free_space(inode, num_bytes);
+ ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
if (ret)
return ret;
@@ -5390,14 +5646,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && cache->cached == BTRFS_CACHE_NO)
cache_block_group(cache, 1);
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &trans->transaction->dirty_bgs);
- btrfs_get_block_group(cache);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
-
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -5446,6 +5694,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->unused_bgs_lock);
}
}
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &trans->transaction->dirty_bgs);
+ trans->transaction->num_dirty_bgs++;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
@@ -6956,15 +7214,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
return -ENOSPC;
}
- if (btrfs_test_opt(root, DISCARD))
- ret = btrfs_discard_extent(root, start, len, NULL);
-
if (pin)
pin_down_extent(root, cache, start, len, 1);
else {
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start, len, NULL);
btrfs_add_free_space(cache, start, len);
btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
}
+
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, start, len);
@@ -7095,9 +7353,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
if (ret) {
+ btrfs_free_path(path);
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
root->nodesize);
- btrfs_free_path(path);
return ret;
}
@@ -7217,7 +7475,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
- clean_tree_block(trans, root, buf);
+ clean_tree_block(trans, root->fs_info, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
@@ -7311,7 +7569,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
* returns the key for the extent through ins, and a tree buffer for
* the first block of the extent through buf.
*
- * returns the tree buffer or NULL.
+ * returns the tree buffer or an ERR_PTR on error.
*/
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7322,6 +7580,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
+ struct btrfs_delayed_extent_op *extent_op;
u64 flags = 0;
int ret;
u32 blocksize = root->nodesize;
@@ -7342,13 +7601,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
ret = btrfs_reserve_extent(root, blocksize, blocksize,
empty_size, hint, &ins, 0, 0);
- if (ret) {
- unuse_block_rsv(root->fs_info, block_rsv, blocksize);
- return ERR_PTR(ret);
- }
+ if (ret)
+ goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
- BUG_ON(IS_ERR(buf)); /* -ENOMEM */
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_free_reserved;
+ }
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -7358,9 +7618,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(parent > 0);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- struct btrfs_delayed_extent_op *extent_op;
extent_op = btrfs_alloc_delayed_extent_op();
- BUG_ON(!extent_op); /* -ENOMEM */
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
else
@@ -7375,13 +7637,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
- ins.objectid,
- ins.offset, parent, root_objectid,
- level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op, 0);
- BUG_ON(ret); /* -ENOMEM */
+ ins.objectid, ins.offset,
+ parent, root_objectid, level,
+ BTRFS_ADD_DELAYED_EXTENT,
+ extent_op, 0);
+ if (ret)
+ goto out_free_delayed;
}
return buf;
+
+out_free_delayed:
+ btrfs_free_delayed_extent_op(extent_op);
+out_free_buf:
+ free_extent_buffer(buf);
+out_free_reserved:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
+out_unuse:
+ unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+ return ERR_PTR(ret);
}
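
The rewritten error handling is the canonical kernel goto ladder: resources
are acquired top to bottom, and a failure jumps to the label that unwinds
everything acquired so far, in reverse order. A self-contained sketch of the
shape (toy steps, not the btrfs calls):

    #include <stdio.h>

    static int step(const char *what, int fail)
    {
        if (fail) {
            printf("failed to take %s\n", what);
            return -1;
        }
        printf("take %s\n", what);
        return 0;
    }

    static void undo(const char *what)
    {
        printf("undo %s\n", what);
    }

    /* mirrors the out_free_delayed/out_free_buf/out_free_reserved/out_unuse
     * ladder: unwind in reverse order of acquisition */
    static int alloc_tree_block(int fail_at)
    {
        int ret;

        ret = step("block_rsv", 0);
        if (ret)
            goto out;
        ret = step("reserved extent", fail_at == 1);
        if (ret)
            goto out_unuse;
        ret = step("buffer", fail_at == 2);
        if (ret)
            goto out_free_reserved;
        ret = step("delayed ref", fail_at == 3);
        if (ret)
            goto out_free_buf;
        return 0;

    out_free_buf:
        undo("buffer");
    out_free_reserved:
        undo("reserved extent");
    out_unuse:
        undo("block_rsv");
    out:
        return ret;
    }

    int main(void)
    {
        printf("ret %d\n", alloc_tree_block(3));
        return 0;
    }
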
struct walk_control {
@@ -7815,7 +8088,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
blocksize = root->nodesize;
- next = btrfs_find_tree_block(root, bytenr);
+ next = btrfs_find_tree_block(root->fs_info, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
@@ -8016,7 +8289,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
- clean_tree_block(trans, root, eb);
+ clean_tree_block(trans, root->fs_info, eb);
}
if (eb == root->node) {
@@ -8533,10 +8806,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
BUG_ON(cache->ro);
+again:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
+ /*
+ * we're not allowed to set block groups readonly after the dirty
+ * block groups cache has started writing. If it already started,
+ * back off and let this transaction commit
+ */
+ mutex_lock(&root->fs_info->ro_block_group_mutex);
+ if (trans->transaction->dirty_bg_run) {
+ u64 transid = trans->transid;
+
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans, root);
+
+ ret = btrfs_wait_for_commit(root, transid);
+ if (ret)
+ return ret;
+ goto again;
+ }
+
ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
@@ -8551,6 +8844,7 @@ out:
alloc_flags = update_block_group_flags(root, cache->flags);
check_system_chunk(trans, root, alloc_flags);
}
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
btrfs_end_transaction(trans, root);
return ret;
@@ -8720,7 +9014,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
min_free <<= 1;
} else if (index == BTRFS_RAID_RAID0) {
dev_min = fs_devices->rw_devices;
- do_div(min_free, dev_min);
+ min_free = div64_u64(min_free, dev_min);
}
/* We need to do this so that we can look at pending chunks */
@@ -8992,6 +9286,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
INIT_LIST_HEAD(&cache->dirty_list);
+ INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
@@ -9355,7 +9650,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * get the inode first so any iput calls done for the io_list
+ * aren't the final iput (no unlinks allowed now)
+ */
inode = lookup_free_space_inode(tree_root, block_group, path);
+
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ /*
+ * make sure our free space cache IO is done before removing the
+ * free space inode
+ */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ btrfs_wait_cache_io(root, trans, block_group,
+ &block_group->io_ctl, path,
+ block_group->key.objectid);
+ btrfs_put_block_group(block_group);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ }
+
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
if (!IS_ERR(inode)) {
ret = btrfs_orphan_add(trans, inode);
if (ret) {
@@ -9448,18 +9774,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&trans->transaction->dirty_bgs_lock);
if (!list_empty(&block_group->dirty_list)) {
- list_del_init(&block_group->dirty_list);
- btrfs_put_block_group(block_group);
+ WARN_ON(1);
+ }
+ if (!list_empty(&block_group->io_list)) {
+ WARN_ON(1);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
-
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
list_del_init(&block_group->ro_list);
+
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ WARN_ON(block_group->space_info->total_bytes
+ < block_group->key.offset);
+ WARN_ON(block_group->space_info->bytes_readonly
+ < block_group->key.offset);
+ WARN_ON(block_group->space_info->disk_total
+ < block_group->key.offset * factor);
+ }
block_group->space_info->total_bytes -= block_group->key.offset;
block_group->space_info->bytes_readonly -= block_group->key.offset;
block_group->space_info->disk_total -= block_group->key.offset * factor;
+
spin_unlock(&block_group->space_info->lock);
memcpy(&key, &block_group->key, sizeof(key));
@@ -9647,8 +9984,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/* Reset pinned so btrfs_put_block_group doesn't complain */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+
+ space_info->bytes_pinned -= block_group->pinned;
+ space_info->bytes_readonly += block_group->pinned;
+ percpu_counter_add(&space_info->total_bytes_pinned,
+ -block_group->pinned);
block_group->pinned = 0;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d688cfe5d496..c32d226bfecc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4514,8 +4514,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
em_len, flags);
- if (ret)
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
goto out_free;
+ }
}
out_free:
free_extent_map(em);
@@ -4557,36 +4560,37 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
do {
index--;
page = eb->pages[index];
- if (page && mapped) {
+ if (!page)
+ continue;
+ if (mapped)
spin_lock(&page->mapping->private_lock);
+ /*
+ * We do this since we'll remove the pages after we've
+ * removed the eb from the radix tree, so we could race
+ * and have this page now attached to the new eb. So
+ * only clear page_private if it's still connected to
+ * this eb.
+ */
+ if (PagePrivate(page) &&
+ page->private == (unsigned long)eb) {
+ BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(PageDirty(page));
+ BUG_ON(PageWriteback(page));
/*
- * We do this since we'll remove the pages after we've
- * removed the eb from the radix tree, so we could race
- * and have this page now attached to the new eb. So
- * only clear page_private if it's still connected to
- * this eb.
+ * We need to make sure we haven't been attached
+ * to a new eb.
*/
- if (PagePrivate(page) &&
- page->private == (unsigned long)eb) {
- BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(PageDirty(page));
- BUG_ON(PageWriteback(page));
- /*
- * We need to make sure we haven't be attached
- * to a new eb.
- */
- ClearPagePrivate(page);
- set_page_private(page, 0);
- /* One for the page private */
- page_cache_release(page);
- }
- spin_unlock(&page->mapping->private_lock);
-
- }
- if (page) {
- /* One for when we alloced the page */
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* One for the page private */
page_cache_release(page);
}
+
+ if (mapped)
+ spin_unlock(&page->mapping->private_lock);
+
+ /* One for when we allocated the page */
+ page_cache_release(page);
} while (index != 0);
}
@@ -4768,6 +4772,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
+ /*
+ * Lock our eb's refs_lock to avoid races with
+ * free_extent_buffer. When we get our eb it might be flagged
+ * with EXTENT_BUFFER_STALE and another task running
+ * free_extent_buffer might have seen that flag set,
+ * eb->refs == 2, that the buffer isn't under IO (dirty and
+ * writeback flags not set) and it's still in the tree (flag
+ * EXTENT_BUFFER_TREE_REF set), therefore being in the process
+ * of decrementing the extent buffer's reference count twice.
+ * So here we could race and increment the eb's reference count,
+ * clear its stale flag, mark it as dirty and drop our reference
+ * before the other task finishes executing free_extent_buffer,
+ * which would later result in an attempt to free an extent
+ * buffer that is dirty.
+ */
+ if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+ spin_lock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
+ }
mark_extent_buffer_accessed(eb, NULL);
return eb;
}
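
The empty lock/unlock pair is deliberate: it protects nothing by itself, it
only makes this task wait until any free_extent_buffer() currently inside
refs_lock has finished, which is exactly the race the comment describes. The
same idiom in user-space pthreads (illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_spinlock_t refs_lock;

    /*
     * Taking a lock and immediately dropping it acts as a barrier:
     * once we get it, whoever held it has completed their critical section.
     */
    static void wait_for_holder(void)
    {
        pthread_spin_lock(&refs_lock);
        pthread_spin_unlock(&refs_lock);
    }

    int main(void)
    {
        pthread_spin_init(&refs_lock, PTHREAD_PROCESS_PRIVATE);
        wait_for_holder();
        puts("any prior critical section has finished");
        pthread_spin_destroy(&refs_lock);
        return 0;
    }
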
@@ -4867,6 +4890,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
+ exists = NULL;
/*
* Do this so attach doesn't complain and we need to
@@ -4930,12 +4954,12 @@ again:
return eb;
free_eb:
+ WARN_ON(!atomic_dec_and_test(&eb->refs));
for (i = 0; i < num_pages; i++) {
if (eb->pages[i])
unlock_page(eb->pages[i]);
}
- WARN_ON(!atomic_dec_and_test(&eb->refs));
btrfs_release_extent_buffer(eb);
return exists;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 695b0ccfb755..c668f36898d3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -97,7 +97,7 @@ struct extent_io_tree {
u64 dirty_bytes;
int track_uptodate;
spinlock_t lock;
- struct extent_io_ops *ops;
+ const struct extent_io_ops *ops;
};
struct extent_state {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 84a2d1868271..58ece6558430 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
- GFP_NOFS);
+ btrfs_bio->csum_allocated = kmalloc_array(nblocks,
+ csum_size, GFP_NOFS);
if (!btrfs_bio->csum_allocated) {
btrfs_free_path(path);
return -ENOMEM;
@@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root,
btrfs_truncate_item(root, path, new_size, 0);
key->offset = end_byte;
- btrfs_set_item_key_safe(root, path, key);
+ btrfs_set_item_key_safe(root->fs_info, path, key);
} else {
BUG();
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 30982bbd31c3..b072e17479aa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,7 +24,6 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
-#include <linux/aio.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
@@ -32,6 +31,7 @@
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
+#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
defrag = rb_entry(node, struct inode_defrag, rb_node);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- if (need_resched()) {
- spin_unlock(&fs_info->defrag_inodes_lock);
- cond_resched();
- spin_lock(&fs_info->defrag_inodes_lock);
- }
+ cond_resched_lock(&fs_info->defrag_inodes_lock);
node = rb_first(&fs_info->defrag_inodes);
}
@@ -868,7 +864,7 @@ next_slot:
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
- btrfs_set_item_key_safe(root, path, &new_key);
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
extent_offset += end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@ -1126,7 +1122,7 @@ again:
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(root, path, &new_key);
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
@@ -1160,7 +1156,7 @@ again:
trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(root, path, &new_key);
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -1485,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
PAGE_CACHE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
- pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+ pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return -ENOMEM;
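
kmalloc_array(n, size, flags) differs from kmalloc(n * size, flags) only in
failing cleanly when n * size would overflow, instead of silently allocating a
short buffer. A user-space equivalent of the guard (calloc gives the same
guarantee plus zeroing):

    #include <stdlib.h>
    #include <stdint.h>
    #include <stdio.h>

    /* like kmalloc_array: NULL on multiplication overflow */
    static void *malloc_array(size_t n, size_t size)
    {
        if (size != 0 && n > SIZE_MAX / size)
            return NULL;
        return malloc(n * size);
    }

    int main(void)
    {
        void *ok = malloc_array(16, sizeof(void *));
        void *bad = malloc_array(SIZE_MAX / 2, 4);  /* would overflow */

        printf("ok=%p bad=%p\n", ok, bad);
        free(ok);
        return 0;
    }
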
@@ -1514,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = btrfs_check_data_free_space(inode, reserve_bytes);
+ ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
if (ret == -ENOSPC &&
(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC))) {
@@ -1635,8 +1631,8 @@ again:
btrfs_end_write_no_snapshoting(root);
if (only_release_metadata && copied > 0) {
- u64 lockstart = round_down(pos, root->sectorsize);
- u64 lockend = lockstart +
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = lockstart +
(dirty_pages << PAGE_CACHE_SHIFT) - 1;
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
@@ -1739,27 +1735,19 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
u64 start_pos;
u64 end_pos;
ssize_t num_written = 0;
- ssize_t err = 0;
- size_t count = iov_iter_count(from);
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
- loff_t pos = iocb->ki_pos;
+ ssize_t err;
+ loff_t pos;
+ size_t count;
mutex_lock(&inode->i_mutex);
-
- current->backing_dev_info = inode_to_bdi(inode);
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err) {
+ err = generic_write_checks(iocb, from);
+ if (err <= 0) {
mutex_unlock(&inode->i_mutex);
- goto out;
- }
-
- if (count == 0) {
- mutex_unlock(&inode->i_mutex);
- goto out;
+ return err;
}
- iov_iter_truncate(from, count);
-
+ current->backing_dev_info = inode_to_bdi(inode);
err = file_remove_suid(file);
if (err) {
mutex_unlock(&inode->i_mutex);
@@ -1786,6 +1774,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
*/
update_time_for_write(inode);
+ pos = iocb->ki_pos;
+ count = iov_iter_count(from);
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
/* Expand hole size to cover write data, preventing empty gap */
@@ -1800,7 +1790,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (sync)
atomic_inc(&BTRFS_I(inode)->sync_writers);
- if (file->f_flags & O_DIRECT) {
+ if (iocb->ki_flags & IOCB_DIRECT) {
num_written = __btrfs_direct_write(iocb, from, pos);
} else {
num_written = __btrfs_buffered_write(file, from, pos);
@@ -1815,7 +1805,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* otherwise subsequent syncs to a file that's been synced in this
 * transaction will appear to have already occurred.
*/
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_sub_trans = root->log_transid;
+ spin_unlock(&BTRFS_I(inode)->lock);
if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
if (err < 0)
@@ -1870,7 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
@@ -2168,7 +2160,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
u64 num_bytes;
key.offset = offset;
- btrfs_set_item_key_safe(root, path, &key);
+ btrfs_set_item_key_safe(root->fs_info, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2551,7 +2543,6 @@ static long btrfs_fallocate(struct file *file, int mode,
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
- struct btrfs_root *root = BTRFS_I(inode)->root;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
@@ -2576,14 +2567,9 @@ static long btrfs_fallocate(struct file *file, int mode,
* Make sure we have enough space before we do the
* allocation.
*/
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start,
+ alloc_end - alloc_start);
if (ret)
return ret;
- if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
- if (ret)
- goto out_reserve_fail;
- }
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
@@ -2673,23 +2659,35 @@ static long btrfs_fallocate(struct file *file, int mode,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
-
- if (ret < 0) {
- free_extent_map(em);
- break;
- }
} else if (actual_end > inode->i_size &&
!(mode & FALLOC_FL_KEEP_SIZE)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
/*
* We didn't need to allocate any more space, but we
* still extended the size of the file so we need to
- * update i_size.
+ * update i_size and the inode item.
*/
- inode->i_ctime = CURRENT_TIME;
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end, NULL);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end,
+ NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_end_transaction(trans,
+ root);
+ }
}
free_extent_map(em);
+ if (ret < 0)
+ break;
cur_offset = last_byte;
if (cur_offset >= alloc_end) {
@@ -2701,9 +2699,6 @@ static long btrfs_fallocate(struct file *file, int mode,
&cached_state, GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
- if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, alloc_end - alloc_start);
-out_reserve_fail:
/* Let go of our reservation. */
btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
return ret;
@@ -2806,8 +2801,6 @@ out:
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a71978578fa7..9dbe5b548fa6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
}
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_mask(inode->i_mapping) &
+ ~(__GFP_FS | __GFP_HIGHMEM));
return inode;
}
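
Masking out __GFP_HIGHMEM here is what makes the later kmap() ->
page_address() conversions in this patch safe: lowmem pages are permanently
mapped, so their kernel virtual address can be used directly with no temporary
mapping (on typical 64-bit builds there is no highmem at all, but the mask
matters for 32-bit). A toy model of that invariant, not real kernel API:

    #include <stdio.h>

    struct page { void *lowmem_vaddr; /* NULL for highmem pages */ };

    /* page_address() only works when the page is permanently mapped */
    static void *page_addr(struct page *p)
    {
        return p->lowmem_vaddr;
    }

    int main(void)
    {
        char buf[4096];
        struct page low = { buf }, high = { NULL };

        printf("lowmem %p, highmem %p (would need kmap)\n",
               page_addr(&low), page_addr(&high));
        return 0;
    }
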
@@ -170,13 +171,13 @@ static int __create_free_space_inode(struct btrfs_root *root,
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;
-
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
btrfs_release_path(path);
return ret;
}
+
leaf = path->nodes[0];
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
@@ -225,9 +226,37 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
struct inode *inode)
{
int ret = 0;
+ struct btrfs_path *path = btrfs_alloc_path();
+
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ if (block_group) {
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ btrfs_wait_cache_io(root, trans, block_group,
+ &block_group->io_ctl, path,
+ block_group->key.objectid);
+ btrfs_put_block_group(block_group);
+ }
+
+ /*
+ * now that we've truncated the cache away, it's no longer
+ * set up or written
+ */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ }
+ btrfs_free_path(path);
btrfs_i_size_write(inode, 0);
truncate_pagecache(inode, 0);
@@ -235,15 +264,23 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
/*
* We don't need an orphan item because truncating the free space cache
* will never be split across transactions.
+ * We don't need to check for -EAGAIN because we're a free space
+ * cache inode.
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
if (ret) {
+ mutex_unlock(&trans->transaction->cache_write_mutex);
btrfs_abort_transaction(trans, root, ret);
return ret;
}
ret = btrfs_update_inode(trans, root, inode);
+
+ if (block_group)
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+fail:
if (ret)
btrfs_abort_transaction(trans, root, ret);
@@ -269,18 +306,7 @@ static int readahead_cache(struct inode *inode)
return 0;
}
-struct io_ctl {
- void *cur, *orig;
- struct page *page;
- struct page **pages;
- struct btrfs_root *root;
- unsigned long size;
- int index;
- int num_pages;
- unsigned check_crcs:1;
-};
-
-static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
+static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
struct btrfs_root *root, int write)
{
int num_pages;
@@ -296,45 +322,46 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
(num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
return -ENOSPC;
- memset(io_ctl, 0, sizeof(struct io_ctl));
+ memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
- io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
+ io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
if (!io_ctl->pages)
return -ENOMEM;
io_ctl->num_pages = num_pages;
io_ctl->root = root;
io_ctl->check_crcs = check_crcs;
+ io_ctl->inode = inode;
return 0;
}
-static void io_ctl_free(struct io_ctl *io_ctl)
+static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
kfree(io_ctl->pages);
+ io_ctl->pages = NULL;
}
-static void io_ctl_unmap_page(struct io_ctl *io_ctl)
+static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
if (io_ctl->cur) {
- kunmap(io_ctl->page);
io_ctl->cur = NULL;
io_ctl->orig = NULL;
}
}
-static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
+static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
ASSERT(io_ctl->index < io_ctl->num_pages);
io_ctl->page = io_ctl->pages[io_ctl->index++];
- io_ctl->cur = kmap(io_ctl->page);
+ io_ctl->cur = page_address(io_ctl->page);
io_ctl->orig = io_ctl->cur;
io_ctl->size = PAGE_CACHE_SIZE;
if (clear)
memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
}
-static void io_ctl_drop_pages(struct io_ctl *io_ctl)
+static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
int i;
@@ -349,7 +376,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
}
}
-static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int uptodate)
{
struct page *page;
@@ -383,7 +410,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
return 0;
}
-static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
+static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
__le64 *val;
@@ -406,7 +433,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
io_ctl->cur += sizeof(u64);
}
-static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
+static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
__le64 *gen;
@@ -435,7 +462,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
return 0;
}
-static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
+static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
u32 *tmp;
u32 crc = ~(u32)0;
@@ -453,13 +480,12 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
io_ctl_unmap_page(io_ctl);
- tmp = kmap(io_ctl->pages[0]);
+ tmp = page_address(io_ctl->pages[0]);
tmp += index;
*tmp = crc;
- kunmap(io_ctl->pages[0]);
}
-static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
+static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
u32 *tmp, val;
u32 crc = ~(u32)0;
@@ -473,10 +499,9 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
- tmp = kmap(io_ctl->pages[0]);
+ tmp = page_address(io_ctl->pages[0]);
tmp += index;
val = *tmp;
- kunmap(io_ctl->pages[0]);
io_ctl_map_page(io_ctl, 0);
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
@@ -492,7 +517,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
return 0;
}
-static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
+static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
void *bitmap)
{
struct btrfs_free_space_entry *entry;
@@ -522,7 +547,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
return 0;
}
-static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
+static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
if (!io_ctl->cur)
return -ENOSPC;
@@ -545,7 +570,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
return 0;
}
-static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
+static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
/*
* If we're not on the boundary we know we've modified the page and we
@@ -562,7 +587,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
}
}
-static int io_ctl_read_entry(struct io_ctl *io_ctl,
+static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space *entry, u8 *type)
{
struct btrfs_free_space_entry *e;
@@ -589,7 +614,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
return 0;
}
-static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
+static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space *entry)
{
int ret;
@@ -648,7 +673,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
{
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct io_ctl io_ctl;
+ struct btrfs_io_ctl io_ctl;
struct btrfs_key key;
struct btrfs_free_space *e, *n;
LIST_HEAD(bitmaps);
@@ -877,7 +902,7 @@ out:
}
static noinline_for_stack
-int write_cache_extent_entries(struct io_ctl *io_ctl,
+int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group_cache *block_group,
int *entries, int *bitmaps,
@@ -885,6 +910,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
{
int ret;
struct btrfs_free_cluster *cluster = NULL;
+ struct btrfs_free_cluster *cluster_locked = NULL;
struct rb_node *node = rb_first(&ctl->free_space_offset);
struct btrfs_trim_range *trim_entry;
@@ -896,6 +922,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
}
if (!node && cluster) {
+ cluster_locked = cluster;
+ spin_lock(&cluster_locked->lock);
node = rb_first(&cluster->root);
cluster = NULL;
}
@@ -919,9 +947,15 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
node = rb_next(node);
if (!node && cluster) {
node = rb_first(&cluster->root);
+ cluster_locked = cluster;
+ spin_lock(&cluster_locked->lock);
cluster = NULL;
}
}
+ if (cluster_locked) {
+ spin_unlock(&cluster_locked->lock);
+ cluster_locked = NULL;
+ }
/*
* Make sure we don't miss any range that was removed from our rbtree
@@ -939,6 +973,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
return 0;
fail:
+ if (cluster_locked)
+ spin_unlock(&cluster_locked->lock);
return -ENOSPC;
}
@@ -1000,7 +1036,7 @@ fail:
static noinline_for_stack int
write_pinned_extent_entries(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
- struct io_ctl *io_ctl,
+ struct btrfs_io_ctl *io_ctl,
int *entries)
{
u64 start, extent_start, extent_end, len;
@@ -1050,7 +1086,7 @@ write_pinned_extent_entries(struct btrfs_root *root,
}
static noinline_for_stack int
-write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
struct list_head *pos, *n;
int ret;
@@ -1083,10 +1119,7 @@ static int flush_dirty_cache(struct inode *inode)
}
static void noinline_for_stack
-cleanup_write_cache_enospc(struct inode *inode,
- struct io_ctl *io_ctl,
- struct extent_state **cached_state,
- struct list_head *bitmap_list)
+cleanup_bitmap_list(struct list_head *bitmap_list)
{
struct list_head *pos, *n;
@@ -1095,12 +1128,85 @@ cleanup_write_cache_enospc(struct inode *inode,
list_entry(pos, struct btrfs_free_space, list);
list_del_init(&entry->list);
}
+}
+
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+ struct btrfs_io_ctl *io_ctl,
+ struct extent_state **cached_state,
+ struct list_head *bitmap_list)
+{
io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, cached_state,
GFP_NOFS);
}
+int btrfs_wait_cache_io(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path, u64 offset)
+{
+ int ret;
+ struct inode *inode = io_ctl->inode;
+
+ if (!inode)
+ return 0;
+
+ if (block_group)
+ root = root->fs_info->tree_root;
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
+ goto out;
+
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
+ io_ctl->entries, io_ctl->bitmaps);
+out:
+ io_ctl_free(io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ if (block_group) {
+#ifdef DEBUG
+ btrfs_err(root->fs_info,
+ "failed to write free space cache for block group %llu",
+ block_group->key.objectid);
+#endif
+ }
+ }
+ btrfs_update_inode(trans, root, inode);
+
+ if (block_group) {
+ /* the dirty list is protected by the dirty_bgs_lock */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+
+ /* the disk_cache_state is protected by the block group lock */
+ spin_lock(&block_group->lock);
+
+ /*
+ * only mark this as written if we didn't get put back on
+ * the dirty list while waiting for IO. Otherwise our
+ * cache state won't be right, and we won't get written again
+ */
+ if (!ret && list_empty(&block_group->dirty_list))
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ else if (ret)
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ io_ctl->inode = NULL;
+ iput(inode);
+ }
+
+ return ret;
+}
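
btrfs_wait_cache_io() is the second half of a start/wait split:
__btrfs_write_out_cache() now returns with the pages under writeback and the
entry counts parked in the io_ctl, and this function later waits for the IO
and publishes the result. A user-space sketch of the pattern, with a thread
standing in for the async IO:

    #include <pthread.h>
    #include <stdio.h>

    struct io_ctl {
        pthread_t thread;
        int started;
        int result;
    };

    static void *do_io(void *arg)
    {
        struct io_ctl *io = arg;

        io->result = 0;             /* pretend the write succeeded */
        return NULL;
    }

    /* phase 1: kick off the IO, don't wait */
    static int start_write(struct io_ctl *io)
    {
        io->started = !pthread_create(&io->thread, NULL, do_io, io);
        return io->started ? 0 : -1;
    }

    /* phase 2: wait for the IO and publish the outcome */
    static int wait_write(struct io_ctl *io)
    {
        if (!io->started)
            return 0;               /* nothing in flight */
        pthread_join(io->thread, NULL);
        io->started = 0;
        return io->result;
    }

    int main(void)
    {
        struct io_ctl io = { 0 };

        if (start_write(&io) == 0)
            printf("wait -> %d\n", wait_write(&io));
        return 0;
    }
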
+
/**
* __btrfs_write_out_cache - write out cached info to an inode
* @root - the root the inode belongs to
@@ -1112,27 +1218,29 @@ cleanup_write_cache_enospc(struct inode *inode,
*
* This function writes out a free space cache struct to disk for quick recovery
 * on mount. This will return 0 if it was successful in writing the cache out,
- * and -1 if it was not.
+ * or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 offset)
{
struct extent_state *cached_state = NULL;
- struct io_ctl io_ctl;
LIST_HEAD(bitmap_list);
int entries = 0;
int bitmaps = 0;
int ret;
+ int must_iput = 0;
if (!i_size_read(inode))
- return -1;
+ return -EIO;
- ret = io_ctl_init(&io_ctl, inode, root, 1);
+ WARN_ON(io_ctl->pages);
+ ret = io_ctl_init(io_ctl, inode, root, 1);
if (ret)
- return -1;
+ return ret;
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
down_write(&block_group->data_rwsem);
@@ -1143,55 +1251,59 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
up_write(&block_group->data_rwsem);
BTRFS_I(inode)->generation = 0;
ret = 0;
+ must_iput = 1;
goto out;
}
spin_unlock(&block_group->lock);
}
/* Lock all pages first so we can lock the extent safely. */
- io_ctl_prepare_pages(&io_ctl, inode, 0);
+ ret = io_ctl_prepare_pages(io_ctl, inode, 0);
+ if (ret)
+ goto out;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state);
- io_ctl_set_generation(&io_ctl, trans->transid);
+ io_ctl_set_generation(io_ctl, trans->transid);
mutex_lock(&ctl->cache_writeout_mutex);
/* Write out the extent entries in the free space cache */
- ret = write_cache_extent_entries(&io_ctl, ctl,
+ spin_lock(&ctl->tree_lock);
+ ret = write_cache_extent_entries(io_ctl, ctl,
block_group, &entries, &bitmaps,
&bitmap_list);
- if (ret) {
- mutex_unlock(&ctl->cache_writeout_mutex);
- goto out_nospc;
- }
+ if (ret)
+ goto out_nospc_locked;
/*
* Some spaces that are freed in the current transaction are pinned,
* they will be added into free space cache after the transaction is
* committed, we shouldn't lose them.
+ *
+ * If this changes while we are working we'll get added back to
+ * the dirty list and redo it. No locking needed
*/
- ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
- if (ret) {
- mutex_unlock(&ctl->cache_writeout_mutex);
- goto out_nospc;
- }
+ ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
+ if (ret)
+ goto out_nospc_locked;
/*
* At last, we write out all the bitmaps and keep cache_writeout_mutex
* locked while doing it because a concurrent trim can be manipulating
* or freeing the bitmap.
*/
- ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ ret = write_bitmap_entries(io_ctl, &bitmap_list);
+ spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
if (ret)
goto out_nospc;
/* Zero out the rest of the pages just to make sure */
- io_ctl_zero_remaining_pages(&io_ctl);
+ io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
0, i_size_read(inode), &cached_state);
if (ret)
goto out_nospc;
@@ -1202,30 +1314,44 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* Release the pages and unlock the extent, we will flush
* them out later
*/
- io_ctl_drop_pages(&io_ctl);
+ io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state, GFP_NOFS);
- /* Flush the dirty pages in the cache file. */
- ret = flush_dirty_cache(inode);
+ /*
+ * At this point the pages are under IO and we're happy.
+ * The caller is responsible for waiting on them and updating
+ * the cache and the inode.
+ */
+ io_ctl->entries = entries;
+ io_ctl->bitmaps = bitmaps;
+
+ ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
if (ret)
goto out;
- /* Update the cache item to tell everyone this cache file is valid. */
- ret = update_cache_item(trans, root, inode, path, offset,
- entries, bitmaps);
+ return 0;
+
out:
- io_ctl_free(&io_ctl);
+ io_ctl->inode = NULL;
+ io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
+ if (must_iput)
+ iput(inode);
return ret;
+out_nospc_locked:
+ cleanup_bitmap_list(&bitmap_list);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
out_nospc:
- cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+ cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
@@ -1241,7 +1367,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
- enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
root = root->fs_info->tree_root;
@@ -1250,34 +1375,34 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
return 0;
}
-
- if (block_group->delalloc_bytes) {
- block_group->disk_cache_state = BTRFS_DC_WRITTEN;
- spin_unlock(&block_group->lock);
- return 0;
- }
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(root, block_group, path);
if (IS_ERR(inode))
return 0;
- ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
+ ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
+ &block_group->io_ctl, trans,
path, block_group->key.objectid);
if (ret) {
- dcs = BTRFS_DC_ERROR;
- ret = 0;
#ifdef DEBUG
btrfs_err(root->fs_info,
"failed to write free space cache for block group %llu",
block_group->key.objectid);
#endif
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&block_group->lock);
+
+ block_group->io_ctl.inode = NULL;
+ iput(inode);
}
- spin_lock(&block_group->lock);
- block_group->disk_cache_state = dcs;
- spin_unlock(&block_group->lock);
- iput(inode);
+ /*
+ * If ret == 0, the caller is expected to call btrfs_wait_cache_io
+ * to wait for IO and put the inode
+ */
+
return ret;
}
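
With the write now split from the wait, callers are expected to pair the two
stages. A minimal sketch of the intended usage, based on the
btrfs_wait_cache_io() prototype added further down (the real commit-time
caller lives in extent-tree.c and is not part of this hunk):

	ret = btrfs_write_out_cache(root, trans, block_group, path);
	if (!ret)
		/* reap the IO: update the cache item and iput the inode */
		ret = btrfs_wait_cache_io(root, trans, block_group,
					  &block_group->io_ctl, path,
					  block_group->key.objectid);
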
@@ -1298,11 +1423,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
u64 offset)
{
u64 bitmap_start;
- u64 bytes_per_bitmap;
+ u32 bytes_per_bitmap;
bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
bitmap_start = offset - ctl->start;
- bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
bitmap_start *= bytes_per_bitmap;
bitmap_start += ctl->start;
@@ -1521,10 +1646,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->key.offset;
- u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
- int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+ u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
- max_bitmaps = max(max_bitmaps, 1);
+ max_bitmaps = max_t(u32, max_bitmaps, 1);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
@@ -1537,7 +1662,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
max_bytes = MAX_CACHE_BYTES_PER_GIG *
- div64_u64(size, 1024 * 1024 * 1024);
+ div_u64(size, 1024 * 1024 * 1024);
/*
* we want to account for 1 more bitmap than what we have so we can make
@@ -1552,14 +1677,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
}
/*
- * we want the extent entry threshold to always be at most 1/2 the maxw
+ * we want the extent entry threshold to always be at most 1/2 the max
* bytes we can have, or whatever is less than that.
*/
extent_bytes = max_bytes - bitmap_bytes;
- extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
+ extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
ctl->extents_thresh =
- div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
+ div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1673,7 +1798,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*/
if (*bytes >= align) {
tmp = entry->offset - ctl->start + align - 1;
- do_div(tmp, align);
+ tmp = div64_u64(tmp, align);
tmp = tmp * align + ctl->start;
align_off = tmp - entry->offset;
} else {
@@ -2402,11 +2527,8 @@ static void __btrfs_remove_free_space_cache_locked(
} else {
free_bitmap(ctl, info);
}
- if (need_resched()) {
- spin_unlock(&ctl->tree_lock);
- cond_resched();
- spin_lock(&ctl->tree_lock);
- }
+
+ cond_resched_lock(&ctl->tree_lock);
}
}
@@ -2431,11 +2553,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
WARN_ON(cluster->block_group != block_group);
__btrfs_return_cluster_to_free_space(block_group, cluster);
- if (need_resched()) {
- spin_unlock(&ctl->tree_lock);
- cond_resched();
- spin_lock(&ctl->tree_lock);
- }
+
+ cond_resched_lock(&ctl->tree_lock);
}
__btrfs_remove_free_space_cache_locked(ctl);
spin_unlock(&ctl->tree_lock);
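
cond_resched_lock() replaces the open-coded drop/resched/retake dance with a
single helper from <linux/sched.h>. Roughly, each call behaves like this
sketch (simplified; the real helper also folds in spin_needbreak()):

	if (need_resched() || spin_needbreak(&ctl->tree_lock)) {
		spin_unlock(&ctl->tree_lock);
		cond_resched();
		spin_lock(&ctl->tree_lock);
	}
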
@@ -3346,13 +3465,29 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
int ret;
+ struct btrfs_io_ctl io_ctl;
+ bool release_metadata = true;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return 0;
- ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
+ memset(&io_ctl, 0, sizeof(io_ctl));
+ ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
+ trans, path, 0);
+ if (!ret) {
+ /*
+ * At this point writepages() didn't error out, so our metadata
+ * reservation is released when the writeback finishes, at
+ * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
+ * with or without an error.
+ */
+ release_metadata = false;
+ ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+ }
+
if (ret) {
- btrfs_delalloc_release_metadata(inode, inode->i_size);
+ if (release_metadata)
+ btrfs_delalloc_release_metadata(inode, inode->i_size);
#ifdef DEBUG
btrfs_err(root->fs_info,
"failed to write free ino cache for root %llu",
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 88b2238a0aed..a16a029ad3b1 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -48,6 +48,8 @@ struct btrfs_free_space_op {
struct btrfs_free_space *info);
};
+struct btrfs_io_ctl;
+
struct inode *lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_block_group_cache
*block_group, struct btrfs_path *path);
@@ -60,14 +62,19 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
struct inode *inode);
int load_free_space_cache(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
+int btrfs_wait_cache_io(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path, u64 offset);
int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
-
struct inode *lookup_free_ino_inode(struct btrfs_root *root,
struct btrfs_path *path);
int create_free_ino_inode(struct btrfs_root *root,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 74faea3a516e..f6a596d5a637 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -456,7 +456,7 @@ again:
}
if (i_size_read(inode) > 0) {
- ret = btrfs_truncate_free_space_cache(root, trans, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
if (ret) {
if (ret != -ENOSPC)
btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d2e732d7af52..8bb013672aee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,7 +32,6 @@
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
-#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
@@ -43,6 +42,7 @@
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -59,6 +59,7 @@
#include "backref.h"
#include "hash.h"
#include "props.h"
+#include "qgroup.h"
struct btrfs_iget_args {
struct btrfs_key *location;
@@ -470,7 +471,7 @@ again:
*/
if (inode_need_compress(inode)) {
WARN_ON(pages);
- pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
goto cont;
@@ -752,7 +753,6 @@ retry:
}
goto out_free;
}
-
/*
* here we're doing allocation and writeback of the
* compressed pages
@@ -3110,6 +3110,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
if (empty)
return;
+ down_read(&fs_info->delayed_iput_sem);
+
spin_lock(&fs_info->delayed_iput_lock);
list_splice_init(&fs_info->delayed_iputs, &list);
spin_unlock(&fs_info->delayed_iput_lock);
@@ -3120,6 +3122,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
iput(delayed->inode);
kfree(delayed);
}
+
+ up_read(&root->fs_info->delayed_iput_sem);
}
/*
@@ -3628,25 +3632,28 @@ static void btrfs_read_locked_inode(struct inode *inode)
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+ inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+ inode->i_generation = BTRFS_I(inode)->generation;
+ inode->i_rdev = 0;
+ rdev = btrfs_inode_rdev(leaf, inode_item);
+
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+ BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
* idea about which extents were modified before we were evicted from
* cache.
+ *
+ * This is required for both inode re-read from disk and delayed inode
+ * in delayed_nodes_tree.
*/
if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- inode->i_version = btrfs_inode_sequence(leaf, inode_item);
- inode->i_generation = BTRFS_I(inode)->generation;
- inode->i_rdev = 0;
- rdev = btrfs_inode_rdev(leaf, inode_item);
-
- BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
-
-cache_index:
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -4016,16 +4023,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int ret;
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+ btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0);
- ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
dentry->d_name.name, dentry->d_name.len);
if (ret)
goto out;
@@ -4124,7 +4131,7 @@ out:
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int err = 0;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
@@ -4151,7 +4158,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
goto out;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
dentry->d_name.name, dentry->d_name.len);
if (!err)
btrfs_i_size_write(inode, 0);
@@ -4162,6 +4169,21 @@ out:
return err;
}
+static int truncate_space_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytes_deleted)
+{
+ int ret;
+
+ bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
+ ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
+ bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->bytes_reserved += bytes_deleted;
+ return ret;
+
+}
+
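
truncate_space_check() lets the truncate loop bail out with -EAGAIN when the
reservation cannot be refilled; the callers patched further down
(btrfs_evict_inode() and btrfs_truncate()) treat that like -ENOSPC and retry.
A sketch of the expected caller shape, with assumed surrounding names:

	while (1) {
		ret = btrfs_truncate_inode_items(trans, root, inode,
						 new_size, min_type);
		if (ret != -ENOSPC && ret != -EAGAIN)
			break;	/* finished, or a hard error */
		/* refill trans_block_rsv, then loop to keep truncating */
	}
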
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
@@ -4197,9 +4219,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
u64 ino = btrfs_ino(inode);
+ u64 bytes_deleted = 0;
+ bool be_nice = 0;
+ bool should_throttle = 0;
+ bool should_end = 0;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+ /*
+ * for non-free space inodes and ref cows, we want to back off from
+ * time to time
+ */
+ if (!btrfs_is_free_space_inode(inode) &&
+ test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ be_nice = 1;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -4229,6 +4263,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
key.type = (u8)-1;
search_again:
+ /*
+ * With a 16K leaf size and 128MB extents, you can actually queue
+ * up a huge file in a single leaf; whenever bytes_deleted is > 0,
+ * it is usually huge by the time we get here
+ */
+ if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (btrfs_should_end_transaction(trans, root)) {
+ err = -EAGAIN;
+ goto error;
+ }
+ }
+
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0) {
@@ -4371,22 +4418,39 @@ delete:
} else {
break;
}
+ should_throttle = 0;
+
if (found_extent &&
(test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
root == root->fs_info->tree_root)) {
btrfs_set_path_blocking(path);
+ bytes_deleted += extent_num_bytes;
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
ino, extent_offset, 0);
BUG_ON(ret);
+ if (btrfs_should_throttle_delayed_refs(trans, root))
+ btrfs_async_run_delayed_refs(root,
+ trans->delayed_ref_updates * 2, 0);
+ if (be_nice) {
+ if (truncate_space_check(trans, root,
+ extent_num_bytes)) {
+ should_end = 1;
+ }
+ if (btrfs_should_throttle_delayed_refs(trans,
+ root)) {
+ should_throttle = 1;
+ }
+ }
}
if (found_type == BTRFS_INODE_ITEM_KEY)
break;
if (path->slots[0] == 0 ||
- path->slots[0] != pending_del_slot) {
+ path->slots[0] != pending_del_slot ||
+ should_throttle || should_end) {
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
@@ -4399,6 +4463,23 @@ delete:
pending_del_nr = 0;
}
btrfs_release_path(path);
+ if (should_throttle) {
+ unsigned long updates = trans->delayed_ref_updates;
+ if (updates) {
+ trans->delayed_ref_updates = 0;
+ ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+ if (ret && !err)
+ err = ret;
+ }
+ }
+ /*
+ * if we failed to refill our space rsv, bail out
+ * and let the transaction restart
+ */
+ if (should_end) {
+ err = -EAGAIN;
+ goto error;
+ }
goto search_again;
} else {
path->slots[0]--;
@@ -4415,7 +4496,18 @@ error:
if (last_size != (u64)-1 &&
root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
btrfs_ordered_update_i_size(inode, last_size, NULL);
+
btrfs_free_path(path);
+
+ if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ unsigned long updates = trans->delayed_ref_updates;
+ if (updates) {
+ trans->delayed_ref_updates = 0;
+ ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+ if (ret && !err)
+ err = ret;
+ }
+ }
return err;
}
@@ -4826,7 +4918,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
int err;
@@ -4924,6 +5016,7 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv, *global_rsv;
+ int steal_from_global = 0;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
int ret;
@@ -4991,9 +5084,20 @@ void btrfs_evict_inode(struct inode *inode)
* hard as possible to get this to work.
*/
if (ret)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
+ steal_from_global++;
+ else
+ steal_from_global = 0;
+ ret = 0;
- if (ret) {
+ /*
+ * steal_from_global == 0: we reserved stuff, hooray!
+ * steal_from_global == 1: we didn't reserve stuff, boo!
+ * steal_from_global == 2: we've committed, still not a lot of
+ * room but maybe we'll have room in the global reserve this
+ * time.
+ * steal_from_global == 3: abandon all hope!
+ */
+ if (steal_from_global > 2) {
btrfs_warn(root->fs_info,
"Could not get space for a delete, will truncate on mount %d",
ret);
@@ -5009,10 +5113,40 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ /*
+ * We can't just steal from the global reserve, we need to make
+ * sure there is room to do it, if not we need to commit and try
+ * again.
+ */
+ if (steal_from_global) {
+ if (!btrfs_check_space_for_delayed_refs(trans, root))
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv,
+ min_size);
+ else
+ ret = -ENOSPC;
+ }
+
+ /*
+ * Couldn't steal from the global reserve, we have too much
+ * pending stuff built up, commit the transaction and try it
+ * again.
+ */
+ if (ret) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret) {
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
+ }
+ continue;
+ } else {
+ steal_from_global = 0;
+ }
+
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
- if (ret != -ENOSPC)
+ if (ret != -ENOSPC && ret != -EAGAIN)
break;
trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -5416,10 +5550,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
static int btrfs_dentry_delete(const struct dentry *dentry)
{
struct btrfs_root *root;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (!inode && !IS_ROOT(dentry))
- inode = dentry->d_parent->d_inode;
+ inode = d_inode(dentry->d_parent);
if (inode) {
root = BTRFS_I(inode)->root;
@@ -6226,7 +6360,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
u64 index;
int err;
int drop_inode = 0;
@@ -8081,7 +8215,7 @@ free_ordered:
bio_endio(dio_bio, ret);
}
-static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
const struct iov_iter *iter, loff_t offset)
{
int seg;
@@ -8096,7 +8230,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
goto out;
/* If this is a write we don't need to check anymore */
- if (rw & WRITE)
+ if (iov_iter_rw(iter) == WRITE)
return 0;
/*
* Check to make sure we don't have duplicate iov_base's in this
@@ -8114,8 +8248,8 @@ out:
return retval;
}
-static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -8126,10 +8260,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
bool relock = false;
ssize_t ret;
- if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
+ if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
return 0;
- atomic_inc(&inode->i_dio_count);
+ inode_dio_begin(inode);
smp_mb__after_atomic();
/*
@@ -8144,7 +8278,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
filemap_fdatawrite_range(inode->i_mapping, offset,
offset + count - 1);
- if (rw & WRITE) {
+ if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need update
* the isize, but it is protected by i_mutex. So we can
@@ -8169,16 +8303,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
current->journal_info = &outstanding_extents;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
- inode_dio_done(inode);
+ inode_dio_end(inode);
flags = DIO_LOCKING | DIO_SKIP_HOLES;
wakeup = false;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iter, offset, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (rw & WRITE) {
+ ret = __blockdev_direct_IO(iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iter, offset, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, flags);
+ if (iov_iter_rw(iter) == WRITE) {
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED)
btrfs_delalloc_release_space(inode, count);
@@ -8188,7 +8322,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
}
out:
if (wakeup)
- inode_dio_done(inode);
+ inode_dio_end(inode);
if (relock)
mutex_lock(&inode->i_mutex);
@@ -8581,7 +8715,7 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
- if (ret != -ENOSPC) {
+ if (ret != -ENOSPC && ret != -EAGAIN) {
err = ret;
break;
}
@@ -8875,7 +9009,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
u64 delalloc_bytes;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
u32 blocksize = inode->i_sb->s_blocksize;
generic_fillattr(inode, stat);
@@ -8896,8 +9030,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
- struct inode *new_inode = new_dentry->d_inode;
- struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = d_inode(new_dentry);
+ struct inode *old_inode = d_inode(old_dentry);
struct timespec ctime = CURRENT_TIME;
u64 index = 0;
u64 root_objectid;
@@ -9009,7 +9143,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_name.len);
} else {
ret = __btrfs_unlink_inode(trans, root, old_dir,
- old_dentry->d_inode,
+ d_inode(old_dentry),
old_dentry->d_name.name,
old_dentry->d_name.len);
if (!ret)
@@ -9033,12 +9167,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, dest, new_dir,
- new_dentry->d_inode,
+ d_inode(new_dentry),
new_dentry->d_name.name,
new_dentry->d_name.len);
}
if (!ret && new_inode->i_nlink == 0)
- ret = btrfs_orphan_add(trans, new_dentry->d_inode);
+ ret = btrfs_orphan_add(trans, d_inode(new_dentry));
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out_fail;
@@ -9451,6 +9585,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_end_transaction(trans, root);
break;
}
+
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 74609b931ba5..1c22c6518504 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -456,6 +456,13 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
return ret;
+ /*
+ * Don't create a subvolume whose level is not zero, or the qgroup
+ * accounting will be screwed up since it assumes a subvolume
+ * qgroup's level to be 0.
+ */
+ if (btrfs_qgroup_level(objectid))
+ return -ENOSPC;
+
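
For context: a qgroup ID encodes a 16-bit level in its top bits (the "1/100"
style of qgroup notation), so a level-0 ID is a plain subvolume ID. A sketch
of the helper this check relies on, assuming the usual encoding (it matches
the ">> 48" comparison in the qgroup.c hunk below):

	#define BTRFS_QGROUP_LEVEL_SHIFT	48

	static inline u64 btrfs_qgroup_level(u64 qgroupid)
	{
		/* non-zero level => a grouping qgroup, not a subvolume */
		return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
	}
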
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
* The same as the snapshot creation, please see the comment
@@ -717,7 +724,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;
- inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+ inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto fail;
@@ -761,10 +768,10 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
{
int error;
- if (!victim->d_inode)
+ if (d_really_is_negative(victim))
return -ENOENT;
- BUG_ON(victim->d_parent->d_inode != dir);
+ BUG_ON(d_inode(victim->d_parent) != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
@@ -772,8 +779,8 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
- IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+ if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
+ IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
@@ -792,7 +799,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
{
- if (child->d_inode)
+ if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
@@ -810,7 +817,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
u64 *async_transid, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct inode *dir = parent->dentry->d_inode;
+ struct inode *dir = d_inode(parent->dentry);
struct dentry *dentry;
int error;
@@ -824,7 +831,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_unlock;
error = -EEXIST;
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
goto out_dput;
error = btrfs_may_create(dir, dentry);
@@ -1564,7 +1571,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- do_div(new_size, root->sectorsize);
+ new_size = div_u64(new_size, root->sectorsize);
new_size *= root->sectorsize;
printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
@@ -2294,7 +2301,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
{
struct dentry *parent = file->f_path.dentry;
struct dentry *dentry;
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
@@ -2333,12 +2340,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_unlock_dir;
}
- if (!dentry->d_inode) {
+ if (d_really_is_negative(dentry)) {
err = -ENOENT;
goto out_dput;
}
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
dest = BTRFS_I(inode)->root;
if (!capable(CAP_SYS_ADMIN)) {
/*
@@ -2403,7 +2410,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
"Attempt to delete subvolume %llu during send",
dest->root_key.objectid);
err = -EPERM;
- goto out_dput;
+ goto out_unlock_inode;
}
d_invalidate(dentry);
@@ -2498,6 +2505,7 @@ out_up_write:
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
}
+out_unlock_inode:
mutex_unlock(&inode->i_mutex);
if (!err) {
shrink_dcache_sb(root->fs_info->sb);
@@ -2897,6 +2905,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
if (src == dst)
return -EINVAL;
+ if (len == 0)
+ return 0;
+
btrfs_double_lock(src, loff, dst, dst_loff, len);
ret = extent_same_check_offsets(src, loff, len);
@@ -3039,7 +3050,7 @@ out:
static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 disko)
{
- struct seq_list tree_mod_seq_elem = {};
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
struct ulist *roots;
struct ulist_iterator uiter;
struct ulist_node *root_node = NULL;
@@ -3202,6 +3213,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
key.offset = off;
while (1) {
+ u64 next_key_min_offset = key.offset + 1;
+
/*
* note the key will change type as we walk through the
* tree.
@@ -3282,7 +3295,7 @@ process_slot:
} else if (key.offset >= off + len) {
break;
}
-
+ next_key_min_offset = key.offset + datal;
size = btrfs_item_size_nr(leaf, slot);
read_extent_buffer(leaf, buf,
btrfs_item_ptr_offset(leaf, slot),
@@ -3497,7 +3510,7 @@ process_slot:
break;
}
btrfs_release_path(path);
- key.offset++;
+ key.offset = next_key_min_offset;
}
ret = 0;
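
Instead of advancing the search key one byte at a time, the clone loop now
jumps straight past the extent it just processed. A small illustration with
assumed numbers:

	/*
	 * Extent item at key.offset = 0 covering datal = 128K:
	 *   old: key.offset++           -> next search revisits offset 1
	 *   new: key.offset = 0 + 128K  -> next search starts past the extent
	 */
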
@@ -3626,6 +3639,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (off + len == src->i_size)
len = ALIGN(src->i_size, bs) - off;
+ if (len == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
/* verify the end result is block aligned */
if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
!IS_ALIGNED(destoff, bs))
@@ -4624,6 +4642,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
sa->src, sa->dst);
}
+ /* update qgroup status and info */
+ err = btrfs_run_qgroups(trans, root->fs_info);
+ if (err < 0)
+ btrfs_error(root->fs_info, err,
+ "failed to update qgroup status and info\n");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
ret = err;
@@ -4669,8 +4692,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
/* FIXME: check if the IDs really exist */
if (sa->create) {
- ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
- NULL);
+ ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
} else {
ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
}
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 617553cdb7d3..a2f051347731 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -434,7 +434,7 @@ out:
return ret;
}
-struct btrfs_compress_op btrfs_lzo_compress = {
+const struct btrfs_compress_op btrfs_lzo_compress = {
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
index b7816cefbd13..1b10a3cd1195 100644
--- a/fs/btrfs/math.h
+++ b/fs/btrfs/math.h
@@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor)
if (factor == 10)
return num;
num *= factor;
- do_div(num, 10);
- return num;
+ return div_u64(num, 10);
}
static inline u64 div_factor_fine(u64 num, int factor)
@@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor)
if (factor == 100)
return num;
num *= factor;
- do_div(num, 100);
- return num;
+ return div_u64(num, 100);
}
#endif
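
The div_u64() conversions in this series are about divisor width: a u32
divisor lets 32-bit targets use a cheap 64-by-32 division instead of a full
64-by-64 one. The relevant <linux/math64.h> signatures, simplified:

	u64 div_u64(u64 dividend, u32 divisor);                     /* 64-by-32 */
	u64 div64_u64(u64 dividend, u64 divisor);                   /* 64-by-64 */
	u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder); /* quot+rem */
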
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 157cc54fc634..760c4a5e096b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -722,6 +722,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
int ret = 0;
+ int ret_wb = 0;
u64 end;
u64 orig_end;
struct btrfs_ordered_extent *ordered;
@@ -741,9 +742,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
if (ret)
return ret;
- ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
- if (ret)
- return ret;
+ /*
+ * If we have a writeback error don't return immediately. Wait first
+ * for any ordered extents that haven't completed yet. This is to make
+ * sure no one can dirty the same page ranges and call writepages()
+ * before the ordered extents complete - to avoid failures (-EEXIST)
+ * when adding the new ordered extents to the ordered tree.
+ */
+ ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
while (1) {
@@ -767,7 +773,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
break;
end--;
}
- return ret;
+ return ret_wb ? ret_wb : ret;
}
/*
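
The ret_wb handling is a flush-then-wait idiom: remember the writeback error,
still wait for every ordered extent so nobody can re-dirty the range and trip
-EEXIST, then report the earliest failure. Condensed control flow (a sketch,
not the literal function; wait_for_all_ordered_extents() stands in for the
loop over ordered extents):

	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;			/* submission itself failed */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
	wait_for_all_ordered_extents();		/* always runs, may set ret */
	return ret_wb ? ret_wb : ret;
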
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 129b1dd28527..dca137b04095 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode)
return NULL;
}
+
+
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 058c79eecbfb..3d6546581bb9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -644,9 +644,8 @@ out:
}
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 qgroupid,
- u64 flags, u64 max_rfer, u64 max_excl,
- u64 rsv_rfer, u64 rsv_excl)
+ struct btrfs_root *root,
+ struct btrfs_qgroup *qgroup)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -657,7 +656,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
key.objectid = 0;
key.type = BTRFS_QGROUP_LIMIT_KEY;
- key.offset = qgroupid;
+ key.offset = qgroup->qgroupid;
path = btrfs_alloc_path();
if (!path)
@@ -673,11 +672,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
- btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
- btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
- btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
- btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
- btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
+ btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
+ btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
+ btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
+ btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
+ btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
btrfs_mark_buffer_dirty(l);
@@ -967,6 +966,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
fs_info->pending_quota_state = 0;
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
@@ -982,7 +982,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
list_del(&quota_root->dirty_list);
btrfs_tree_lock(quota_root->node);
- clean_tree_block(trans, tree_root, quota_root->node);
+ clean_tree_block(trans, tree_root->fs_info, quota_root->node);
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
@@ -1001,6 +1001,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
+/*
+ * The easy accounting: if we are adding/removing the only ref for an
+ * extent, then this qgroup and all of the parent qgroups get their
+ * reference and exclusive counts adjusted.
+ *
+ * Caller should hold fs_info->qgroup_lock.
+ */
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+ struct ulist *tmp, u64 ref_root,
+ u64 num_bytes, int sign)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *glist;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ qgroup->rfer += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes;
+
+ WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ qgroup->excl += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+ if (sign > 0)
+ qgroup->reserved -= num_bytes;
+
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Get all of the parent groups that contain this qgroup */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* Iterate all of the parents and adjust their reference counts */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ qgroup = u64_to_ptr(unode->aux);
+ qgroup->rfer += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes;
+ WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ qgroup->excl += sign * num_bytes;
+ if (sign > 0)
+ qgroup->reserved -= num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+
+/*
+ * Quick path for updating qgroup with only excl refs.
+ *
+ * In that case, updating all the parents will be enough.
+ * Otherwise we need to do a full rescan.
+ * Caller should also hold fs_info->qgroup_lock.
+ *
+ * Return 0 for a quick update, >0 if a full rescan is needed
+ * (and the INCONSISTENT flag is marked), and <0 on other errors.
+ */
+static int quick_update_accounting(struct btrfs_fs_info *fs_info,
+ struct ulist *tmp, u64 src, u64 dst,
+ int sign)
+{
+ struct btrfs_qgroup *qgroup;
+ int ret = 1;
+ int err = 0;
+
+ qgroup = find_qgroup_rb(fs_info, src);
+ if (!qgroup)
+ goto out;
+ if (qgroup->excl == qgroup->rfer) {
+ ret = 0;
+ err = __qgroup_excl_accounting(fs_info, tmp, dst,
+ qgroup->excl, sign);
+ if (err < 0) {
+ ret = err;
+ goto out;
+ }
+ }
+out:
+ if (ret)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
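
To make the excl == rfer condition concrete, a worked example with
hypothetical qgroup IDs:

	/*
	 * Child 0/257 has rfer = excl = 1 GiB, i.e. nothing it references
	 * is shared. Assigning it under 1/100 takes the quick path:
	 *     parent->rfer += 1 GiB;  parent->excl += 1 GiB;  (sign = +1)
	 * and deleting the relation repeats this with sign = -1.
	 * If rfer != excl, some extents are shared and per-extent accounting
	 * would be needed, so the INCONSISTENT flag forces a full rescan.
	 */
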
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst)
{
@@ -1008,8 +1112,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
+ struct ulist *tmp;
int ret = 0;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ /* Check the level of src and dst first */
+ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+ return -EINVAL;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -1043,23 +1156,33 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->qgroup_lock);
ret = add_relation_rb(quota_root->fs_info, src, dst);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ goto out;
+ }
+ ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
spin_unlock(&fs_info->qgroup_lock);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ ulist_free(tmp);
return ret;
}
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+int __del_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
+ struct ulist *tmp;
int ret = 0;
int err;
- mutex_lock(&fs_info->qgroup_ioctl_lock);
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
@@ -1088,14 +1211,27 @@ exist:
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
+ ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
spin_unlock(&fs_info->qgroup_lock);
out:
+ ulist_free(tmp);
+ return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ ret = __del_qgroup_relation(trans, fs_info, src, dst);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
return ret;
}
int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
+ struct btrfs_fs_info *fs_info, u64 qgroupid)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
@@ -1133,6 +1269,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *list;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
@@ -1147,15 +1284,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
ret = -ENOENT;
goto out;
} else {
- /* check if there are no relations to this qgroup */
- if (!list_empty(&qgroup->groups) ||
- !list_empty(&qgroup->members)) {
+ /* check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
goto out;
}
}
ret = del_qgroup_item(trans, quota_root, qgroupid);
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list, next_group);
+ ret = __del_qgroup_relation(trans, fs_info,
+ qgroupid,
+ list->group->qgroupid);
+ if (ret)
+ goto out;
+ }
+
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(quota_root->fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
@@ -1184,23 +1330,27 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
ret = -ENOENT;
goto out;
}
- ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
- limit->flags, limit->max_rfer,
- limit->max_excl, limit->rsv_rfer,
- limit->rsv_excl);
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
+ qgroup->max_rfer = limit->max_rfer;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
+ qgroup->max_excl = limit->max_excl;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
+ qgroup->rsv_rfer = limit->rsv_rfer;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
+ qgroup->rsv_excl = limit->rsv_excl;
+ qgroup->lim_flags |= limit->flags;
+
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = update_qgroup_limit_item(trans, quota_root, qgroup);
if (ret) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
btrfs_info(fs_info, "unable to update quota limit for %llu",
qgroupid);
}
- spin_lock(&fs_info->qgroup_lock);
- qgroup->lim_flags = limit->flags;
- qgroup->max_rfer = limit->max_rfer;
- qgroup->max_excl = limit->max_excl;
- qgroup->rsv_rfer = limit->rsv_rfer;
- qgroup->rsv_excl = limit->rsv_excl;
- spin_unlock(&fs_info->qgroup_lock);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -1256,14 +1406,14 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
return -1;
if (oper1->bytenr > oper2->bytenr)
return 1;
- if (oper1->seq < oper2->seq)
- return -1;
- if (oper1->seq > oper2->seq)
- return 1;
if (oper1->ref_root < oper2->ref_root)
return -1;
if (oper1->ref_root > oper2->ref_root)
return 1;
+ if (oper1->seq < oper2->seq)
+ return -1;
+ if (oper1->seq > oper2->seq)
+ return 1;
if (oper1->type < oper2->type)
return -1;
if (oper1->type > oper2->type)
@@ -1372,19 +1522,10 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their refrence and
- * exclusive counts adjusted.
- */
static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_operation *oper)
{
- struct btrfs_qgroup *qgroup;
struct ulist *tmp;
- struct btrfs_qgroup_list *glist;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
int sign = 0;
int ret = 0;
@@ -1395,9 +1536,7 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->qgroup_lock);
if (!fs_info->quota_root)
goto out;
- qgroup = find_qgroup_rb(fs_info, oper->ref_root);
- if (!qgroup)
- goto out;
+
switch (oper->type) {
case BTRFS_QGROUP_OPER_ADD_EXCL:
sign = 1;
@@ -1408,43 +1547,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
default:
ASSERT(0);
}
- qgroup->rfer += sign * oper->num_bytes;
- qgroup->rfer_cmpr += sign * oper->num_bytes;
-
- WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
- qgroup->excl += sign * oper->num_bytes;
- qgroup->excl_cmpr += sign * oper->num_bytes;
-
- qgroup_dirty(fs_info, qgroup);
-
- /* Get all of the parent groups that contain this qgroup */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
-
- /* Iterate all of the parents and adjust their reference counts */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- qgroup = u64_to_ptr(unode->aux);
- qgroup->rfer += sign * oper->num_bytes;
- qgroup->rfer_cmpr += sign * oper->num_bytes;
- WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
- qgroup->excl += sign * oper->num_bytes;
- qgroup->excl_cmpr += sign * oper->num_bytes;
- qgroup_dirty(fs_info, qgroup);
-
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
- }
- ret = 0;
+ ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
+ oper->num_bytes, sign);
out:
spin_unlock(&fs_info->qgroup_lock);
ulist_free(tmp);
@@ -1845,7 +1949,7 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
struct ulist *roots = NULL;
struct ulist *qgroups, *tmp;
struct btrfs_qgroup *qgroup;
- struct seq_list elem = {};
+ struct seq_list elem = SEQ_LIST_INIT(elem);
u64 seq;
int old_roots = 0;
int new_roots = 0;
@@ -1967,7 +2071,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
int err;
struct btrfs_qgroup *qg;
u64 root_obj = 0;
- struct seq_list elem = {};
+ struct seq_list elem = SEQ_LIST_INIT(elem);
parents = ulist_alloc(GFP_NOFS);
if (!parents)
@@ -2156,6 +2260,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
if (ret)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ ret = update_qgroup_limit_item(trans, quota_root, qgroup);
+ if (ret)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
spin_lock(&fs_info->qgroup_lock);
}
if (fs_info->quota_enabled)
@@ -2219,6 +2327,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
ret = -EINVAL;
goto out;
}
+
+ if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
+ ret = -EINVAL;
+ goto out;
+ }
++i_qgroups;
}
}
@@ -2230,17 +2343,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
- ret = update_qgroup_limit_item(trans, quota_root, objectid,
- inherit->lim.flags,
- inherit->lim.max_rfer,
- inherit->lim.max_excl,
- inherit->lim.rsv_rfer,
- inherit->lim.rsv_excl);
- if (ret)
- goto out;
- }
-
if (srcid) {
struct btrfs_root *srcroot;
struct btrfs_key srckey;
@@ -2286,6 +2388,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
goto unlock;
}
+ if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+ dstgroup->lim_flags = inherit->lim.flags;
+ dstgroup->max_rfer = inherit->lim.max_rfer;
+ dstgroup->max_excl = inherit->lim.max_excl;
+ dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
+ dstgroup->rsv_excl = inherit->lim.rsv_excl;
+
+ ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
+ if (ret) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_info(fs_info, "unable to update quota limit for %llu",
+ dstgroup->qgroupid);
+ goto unlock;
+ }
+ }
+
if (srcid) {
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
@@ -2302,6 +2420,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dstgroup->excl_cmpr = level_size;
srcgroup->excl = level_size;
srcgroup->excl_cmpr = level_size;
+
+ /* inherit the limit info */
+ dstgroup->lim_flags = srcgroup->lim_flags;
+ dstgroup->max_rfer = srcgroup->max_rfer;
+ dstgroup->max_excl = srcgroup->max_excl;
+ dstgroup->rsv_rfer = srcgroup->rsv_rfer;
+ dstgroup->rsv_excl = srcgroup->rsv_excl;
+
qgroup_dirty(fs_info, dstgroup);
qgroup_dirty(fs_info, srcgroup);
}
@@ -2358,12 +2484,6 @@ out:
return ret;
}
-/*
- * reserve some space for a qgroup and all its parents. The reservation takes
- * place with start_transaction or dealloc_reserve, similar to ENOSPC
- * accounting. If not enough space is available, EDQUOT is returned.
- * We assume that the requested space is new for all qgroups.
- */
int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
{
struct btrfs_root *quota_root;
@@ -2513,7 +2633,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
/*
* returns < 0 on error, 0 when more leafs are to be scanned.
- * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
+ * returns 1 when done.
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
@@ -2522,7 +2642,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
{
struct btrfs_key found;
struct ulist *roots = NULL;
- struct seq_list tree_mod_seq_elem = {};
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
u64 seq;
int new_roots;
@@ -2618,6 +2738,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
struct ulist *tmp = NULL, *qgroups = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -2660,7 +2781,7 @@ out:
mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- if (err == 2 &&
+ if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
} else if (err < 0) {
@@ -2668,13 +2789,33 @@ out:
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
+ /*
+ * Only update the status item, since the previous part has already
+ * updated the qgroup info.
+ */
+ trans = btrfs_start_transaction(fs_info->quota_root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "failed to start transaction for status update: %d\n",
+ err);
+ goto done;
+ }
+ ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "failed to update qgroup status: %d\n", err);
+ }
+ btrfs_end_transaction(trans, fs_info->quota_root);
+
if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
- err == 2 ? " (inconsistency flag cleared)" : "");
+ err > 0 ? " (inconsistency flag cleared)" : "");
} else {
btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
+done:
complete_all(&fs_info->qgroup_rescan_completion);
}
@@ -2709,7 +2850,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
mutex_unlock(&fs_info->qgroup_rescan_lock);
goto err;
}
-
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 18cc68ca3090..c5242aa9a4b2 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -70,8 +70,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid,
- char *name);
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5264858ed768..fa72068bd256 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
- if (x) {
- if (is_vmalloc_addr(x))
- vfree(x);
- else
- kfree(x);
- }
+ if (x)
+ kvfree(x);
return 0;
}
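
kvfree() subsumes both branches here; it is essentially the following
(mm/util.c, simplified):

	void kvfree(const void *addr)
	{
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
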
@@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
if (!info->stripe_hash_table)
return;
btrfs_clear_rbio_cache(info);
- if (is_vmalloc_addr(info->stripe_hash_table))
- vfree(info->stripe_hash_table);
- else
- kfree(info->stripe_hash_table);
+ kvfree(info->stripe_hash_table);
info->stripe_hash_table = NULL;
}
@@ -1807,8 +1800,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
int err;
int i;
- pointers = kzalloc(rbio->real_stripes * sizeof(void *),
- GFP_NOFS);
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers) {
err = -ENOMEM;
goto cleanup_io;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d83085381bcc..74b24b01d574 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3027,7 +3027,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
mutex_lock(&inode->i_mutex);
ret = btrfs_check_data_free_space(inode, cluster->end +
- 1 - cluster->start);
+ 1 - cluster->start, 0);
if (ret)
goto out;
@@ -3430,7 +3430,9 @@ static int block_use_full_backref(struct reloc_control *rc,
}
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct inode *inode, u64 ino)
+ struct btrfs_block_group_cache *block_group,
+ struct inode *inode,
+ u64 ino)
{
struct btrfs_key key;
struct btrfs_root *root = fs_info->tree_root;
@@ -3463,7 +3465,7 @@ truncate:
goto out;
}
- ret = btrfs_truncate_free_space_cache(root, trans, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
@@ -3509,6 +3511,7 @@ static int find_data_references(struct reloc_control *rc,
*/
if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
ret = delete_block_group_cache(rc->extent_root->fs_info,
+ rc->block_group,
NULL, ref_objectid);
if (ret != -ENOENT)
return ret;
@@ -4223,7 +4226,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_free_path(path);
if (!IS_ERR(inode))
- ret = delete_block_group_cache(fs_info, inode, 0);
+ ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
else
ret = PTR_ERR(inode);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ec57687c9a4d..ab5811545a98 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* the statistics.
*/
- sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
- sizeof(*sblocks_for_recheck),
- GFP_NOFS);
+ sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
+ sizeof(*sblocks_for_recheck), GFP_NOFS);
if (!sblocks_for_recheck) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2319,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
u64 start, u64 len)
{
- int offset;
+ u32 offset;
int nsectors;
int sectorsize = sparity->sctx->dev_root->sectorsize;
@@ -2329,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
}
start -= sparity->logic_start;
- offset = (int)do_div(start, sparity->stripe_len);
+ start = div_u64_rem(start, sparity->stripe_len, &offset);
offset /= sectorsize;
nsectors = (int)len / sectorsize;
@@ -2612,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num,
int j = 0;
u64 stripe_nr;
u64 last_offset;
- int stripe_index;
- int rot;
+ u32 stripe_index;
+ u32 rot;
last_offset = (physical - map->stripes[num].physical) *
nr_data_stripes(map);
@@ -2624,12 +2623,11 @@ static int get_raid56_logic_offset(u64 physical, int num,
for (i = 0; i < nr_data_stripes(map); i++) {
*offset = last_offset + i * map->stripe_len;
- stripe_nr = *offset;
- do_div(stripe_nr, map->stripe_len);
- do_div(stripe_nr, nr_data_stripes(map));
+ stripe_nr = div_u64(*offset, map->stripe_len);
+ stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
/* Work out the disk rotation on this stripe-set */
- rot = do_div(stripe_nr, map->num_stripes);
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
/* calculate which stripe this data locates */
rot += i;
stripe_index = rot % map->num_stripes;
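
do_div() modifies its dividend in place and returns the remainder, which is
easy to misread; div_u64_rem() makes both results explicit. The equivalence,
as a sketch:

	/* before: stripe_nr updated in place, remainder returned */
	rot = do_div(stripe_nr, map->num_stripes);

	/* after: quotient returned, 32-bit remainder stored via pointer */
	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
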
@@ -2995,10 +2993,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- nstripes = length;
physical = map->stripes[num].physical;
offset = 0;
- do_div(nstripes, map->stripe_len);
+ nstripes = div_u64(length, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
@@ -3563,7 +3560,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
int ret = 0;
- int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
if (fs_info->scrub_workers_refcnt == 0) {
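
The scrub.c hunks above, like the volumes.c ones further below, replace do_div() with the div_u64()/div_u64_rem() helpers from <linux/math64.h>. The semantic difference matters for the conversion: do_div(n, base) divides n in place and evaluates to the remainder, while div_u64() returns the quotient and div_u64_rem() additionally stores the remainder, so every converted call site must assign the return value. A minimal userspace sketch of the two styles, with illustration-only stand-ins for the kernel helpers (the do_div macro below uses a GCC statement expression, as the kernel macro itself does):

#include <stdint.h>
#include <stdio.h>

/* Illustration-only stand-ins for the <linux/math64.h> helpers. */
#define do_div(n, base) ({ uint32_t __rem = (n) % (base); (n) /= (base); __rem; })

static uint64_t div_u64_rem(uint64_t dividend, uint32_t divisor, uint32_t *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}

int main(void)
{
	uint64_t offset = 42 * 65536 + 4096;	/* hypothetical stripe math */
	uint32_t stripe_len = 65536, rem;

	/* Old style: the dividend is clobbered in place,
	 * the macro evaluates to the remainder. */
	uint64_t n = offset;
	uint32_t old_rem = do_div(n, stripe_len);

	/* New style: the quotient is the return value and
	 * must be assigned explicitly. */
	uint64_t stripe_nr = div_u64_rem(offset, stripe_len, &rem);

	printf("quotient %llu/%llu remainder %u/%u\n",
	       (unsigned long long)n, (unsigned long long)stripe_nr,
	       old_rem, rem);
	return 0;
}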
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d6033f540cc7..a1216f9b4917 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3067,48 +3067,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
return NULL;
}
-static int path_loop(struct send_ctx *sctx, struct fs_path *name,
- u64 ino, u64 gen, u64 *ancestor_ino)
-{
- int ret = 0;
- u64 parent_inode = 0;
- u64 parent_gen = 0;
- u64 start_ino = ino;
-
- *ancestor_ino = 0;
- while (ino != BTRFS_FIRST_FREE_OBJECTID) {
- fs_path_reset(name);
-
- if (is_waiting_for_rm(sctx, ino))
- break;
- if (is_waiting_for_move(sctx, ino)) {
- if (*ancestor_ino == 0)
- *ancestor_ino = ino;
- ret = get_first_ref(sctx->parent_root, ino,
- &parent_inode, &parent_gen, name);
- } else {
- ret = __get_cur_name_and_parent(sctx, ino, gen,
- &parent_inode,
- &parent_gen, name);
- if (ret > 0) {
- ret = 0;
- break;
- }
- }
- if (ret < 0)
- break;
- if (parent_inode == start_ino) {
- ret = 1;
- if (*ancestor_ino == 0)
- *ancestor_ino = ino;
- break;
- }
- ino = parent_inode;
- gen = parent_gen;
- }
- return ret;
-}
-
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
struct fs_path *from_path = NULL;
@@ -3120,7 +3078,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
struct waiting_dir_move *dm = NULL;
u64 rmdir_ino = 0;
int ret;
- u64 ancestor = 0;
name = fs_path_alloc();
from_path = fs_path_alloc();
@@ -3152,22 +3109,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
goto out;
sctx->send_progress = sctx->cur_ino + 1;
- ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
- if (ret) {
- LIST_HEAD(deleted_refs);
- ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
- ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
- &pm->update_refs, &deleted_refs,
- pm->is_orphan);
- if (ret < 0)
- goto out;
- if (rmdir_ino) {
- dm = get_waiting_dir_move(sctx, pm->ino);
- ASSERT(dm);
- dm->rmdir_ino = rmdir_ino;
- }
- goto out;
- }
fs_path_reset(name);
to_path = name;
name = NULL;
@@ -3610,10 +3551,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
+ struct name_cache_entry *nce;
+
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
goto out;
+ /*
+ * Make sure we clear our orphanized inode's
+ * name from the name cache. This is because the
+ * inode ow_inode might be an ancestor of some
+ * other inode that will also be orphanized later
+ * and has an inode number greater than
+ * sctx->send_progress. We need to prevent
+ * future name lookups from using the old name
+ * and instead get the orphan name.
+ */
+ nce = name_cache_search(sctx, ow_inode, ow_gen);
+ if (nce) {
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
} else {
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
@@ -5852,19 +5810,20 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
ret = PTR_ERR(clone_root);
goto out;
}
- clone_sources_to_rollback = i + 1;
spin_lock(&clone_root->root_item_lock);
- clone_root->send_in_progress++;
- if (!btrfs_root_readonly(clone_root)) {
+ if (!btrfs_root_readonly(clone_root) ||
+ btrfs_root_dead(clone_root)) {
spin_unlock(&clone_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
goto out;
}
+ clone_root->send_in_progress++;
spin_unlock(&clone_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
sctx->clone_roots[i].root = clone_root;
+ clone_sources_to_rollback = i + 1;
}
vfree(clone_sources_tmp);
clone_sources_tmp = NULL;
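
The last send.c hunk above reorders the bookkeeping: send_in_progress is only incremented, and clone_sources_to_rollback only advanced, after the clone root passes the readonly-and-not-dead check, so the -EPERM error path no longer rolls back a reference that was never validly taken. A sketch of that acquire-after-validate shape, with hypothetical names standing in for root_item_lock, btrfs_root_readonly() and btrfs_root_dead():

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct root {
	pthread_mutex_t root_item_lock;
	bool readonly;
	bool dead;
	int send_in_progress;
};

/* Take a send reference on a clone root only if it is usable. */
static int acquire_clone_root(struct root *r)
{
	pthread_mutex_lock(&r->root_item_lock);
	if (!r->readonly || r->dead) {
		/* Reject before counting: the caller's error path
		 * then has nothing to undo for this root. */
		pthread_mutex_unlock(&r->root_item_lock);
		return -EPERM;
	}
	r->send_in_progress++;	/* counted only after validation */
	pthread_mutex_unlock(&r->root_item_lock);
	return 0;
}

int main(void)
{
	struct root r = {
		.root_item_lock = PTHREAD_MUTEX_INITIALIZER,
		.readonly = true,
	};
	return acquire_clone_root(&r) ? 1 : 0;
}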
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 05fef198ff94..9e66f5e724db 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -901,6 +901,15 @@ find_root:
if (IS_ERR(new_root))
return ERR_CAST(new_root);
+ if (!(sb->s_flags & MS_RDONLY)) {
+ int ret;
+ down_read(&fs_info->cleanup_work_sem);
+ ret = btrfs_orphan_cleanup(new_root);
+ up_read(&fs_info->cleanup_work_sem);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
dir_id = btrfs_root_dirid(&new_root->root_item);
setup_root:
location.objectid = dir_id;
@@ -916,7 +925,7 @@ setup_root:
* a reference to the dentry. We will have already gotten a reference
* to the inode in btrfs_fill_super so we're good to go.
*/
- if (!new && sb->s_root->d_inode == inode) {
+ if (!new && d_inode(sb->s_root) == inode) {
iput(inode);
return dget(sb->s_root);
}
@@ -1221,7 +1230,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
root = mount_subtree(mnt, subvol_name);
- if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+ if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
struct super_block *s = root->d_sb;
dput(root);
root = ERR_PTR(-EINVAL);
@@ -1714,7 +1723,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
- do_div(avail_space, BTRFS_STRIPE_LEN);
+ avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
avail_space *= BTRFS_STRIPE_LEN;
/*
@@ -1886,8 +1895,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
- buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
+ buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
+ buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
return 0;
}
@@ -1908,6 +1917,17 @@ static struct file_system_type btrfs_fs_type = {
};
MODULE_ALIAS_FS("btrfs");
+static int btrfs_control_open(struct inode *inode, struct file *file)
+{
+ /*
+ * The control file's private_data holds the transaction
+ * once one is started, and is used to keep track of
+ * whether a transaction is already in progress.
+ */
+ file->private_data = NULL;
+ return 0;
+}
+
/*
* used by btrfsctl to scan devices when no FS is mounted
*/
@@ -2009,6 +2029,7 @@ static const struct super_operations btrfs_super_ops = {
};
static const struct file_operations btrfs_ctl_fops = {
+ .open = btrfs_control_open,
.unlocked_ioctl = btrfs_control_ioctl,
.compat_ioctl = btrfs_control_ioctl,
.owner = THIS_MODULE,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 94edb0a2a026..e8a4c86d274d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -459,7 +459,7 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
-static u64 supported_feature_masks[3] = {
+static const u64 supported_feature_masks[3] = {
[FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
[FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f7dd298b3cf6..3a4bbed723fd 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -61,11 +61,23 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \
BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
/* convert from attribute */
-#define to_btrfs_feature_attr(a) \
- container_of(a, struct btrfs_feature_attr, kobj_attr)
-#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
-#define attr_to_btrfs_feature_attr(a) \
- to_btrfs_feature_attr(attr_to_btrfs_attr(a))
+static inline struct btrfs_feature_attr *
+to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+ return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+ return container_of(attr, struct kobj_attribute, attr);
+}
+
+static inline struct btrfs_feature_attr *
+attr_to_btrfs_feature_attr(struct attribute *attr)
+{
+ return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 73f299ebdabb..c32a7ba76bca 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -232,7 +232,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
init_dummy_trans(&trans);
test_msg("Qgroup basic add\n");
- ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+ ret = btrfs_create_qgroup(NULL, fs_info, 5);
if (ret) {
test_msg("Couldn't create a qgroup %d\n", ret);
return ret;
@@ -301,7 +301,7 @@ static int test_multiple_refs(struct btrfs_root *root)
test_msg("Qgroup multiple refs test\n");
/* We have 5 created already from the previous test */
- ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+ ret = btrfs_create_qgroup(NULL, fs_info, 256);
if (ret) {
test_msg("Couldn't create a qgroup %d\n", ret);
return ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8be4278e25e8..5628e25250c0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,7 +35,7 @@
#define BTRFS_ROOT_TRANS_TAG 0
-static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
[TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
__TRANS_START),
@@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+ if (transaction->delayed_refs.pending_csums)
+ printk(KERN_ERR "pending csums is %llu\n",
+ transaction->delayed_refs.pending_csums);
while (!list_empty(&transaction->pending_chunks)) {
struct extent_map *em;
@@ -93,11 +96,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree)
*/
ASSERT(!waitqueue_active(&state->wq));
free_extent_state(state);
- if (need_resched()) {
- spin_unlock(&tree->lock);
- cond_resched();
- spin_lock(&tree->lock);
- }
+
+ cond_resched_lock(&tree->lock);
}
spin_unlock(&tree->lock);
}
@@ -222,10 +222,12 @@ loop:
atomic_set(&cur_trans->use_count, 2);
cur_trans->have_free_bgs = 0;
cur_trans->start_time = get_seconds();
+ cur_trans->dirty_bg_run = 0;
cur_trans->delayed_refs.href_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.pending_csums = 0;
cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
@@ -250,6 +252,9 @@ loop:
INIT_LIST_HEAD(&cur_trans->switch_commits);
INIT_LIST_HEAD(&cur_trans->pending_ordered);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+ INIT_LIST_HEAD(&cur_trans->io_bgs);
+ mutex_init(&cur_trans->cache_write_mutex);
+ cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
@@ -721,7 +726,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (updates) {
- err = btrfs_run_delayed_refs(trans, root, updates);
+ err = btrfs_run_delayed_refs(trans, root, updates * 2);
if (err) /* Error code will also eval true */
return err;
}
@@ -1057,6 +1062,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+ struct list_head *io_bgs = &trans->transaction->io_bgs;
struct list_head *next;
struct extent_buffer *eb;
int ret;
@@ -1110,7 +1116,7 @@ again:
return ret;
}
- while (!list_empty(dirty_bgs)) {
+ while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
ret = btrfs_write_dirty_block_groups(trans, root);
if (ret)
return ret;
@@ -1810,6 +1816,37 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
+ if (!cur_trans->dirty_bg_run) {
+ int run_it = 0;
+
+ /* this mutex is also taken before trying to set
+ * block groups readonly. We need to make sure
+ * that nobody has set a block group readonly
+ * after extents from that block group have been
+ * allocated for cache files. btrfs_set_block_group_ro
+ * will wait for the transaction to commit if it
+ * finds dirty_bg_run = 1
+ *
+ * The dirty_bg_run flag is also used to make sure only
+ * one process starts all the block group IO. It wouldn't
+ * hurt to have more than one go through, but there's no
+ * real advantage to it either.
+ */
+ mutex_lock(&root->fs_info->ro_block_group_mutex);
+ if (!cur_trans->dirty_bg_run) {
+ run_it = 1;
+ cur_trans->dirty_bg_run = 1;
+ }
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+ if (run_it)
+ ret = btrfs_start_dirty_block_groups(trans, root);
+ }
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
spin_lock(&root->fs_info->trans_lock);
list_splice(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
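
The dirty_bg_run handshake added above is a check/lock/recheck: test the flag, take ro_block_group_mutex, test again, and only the single winner runs btrfs_start_dirty_block_groups(). A userspace sketch of the same shape, assuming C11 atomics and a pthread mutex in place of the kernel's primitives:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t ro_block_group_mutex = PTHREAD_MUTEX_INITIALIZER;

struct transaction {
	atomic_bool dirty_bg_run;
};

/* Returns true for exactly one caller per transaction; that caller
 * must start the dirty block group writeout. */
static bool claim_dirty_bg_run(struct transaction *t)
{
	bool run_it = false;

	if (atomic_load(&t->dirty_bg_run))	/* unlocked fast path */
		return false;

	pthread_mutex_lock(&ro_block_group_mutex);
	if (!atomic_load(&t->dirty_bg_run)) {	/* recheck under mutex */
		atomic_store(&t->dirty_bg_run, true);
		run_it = true;
	}
	pthread_mutex_unlock(&ro_block_group_mutex);
	return run_it;
}

int main(void)
{
	struct transaction t = { .dirty_bg_run = false };
	return claim_dirty_bg_run(&t) ? 0 : 1;	/* first caller wins */
}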
@@ -2003,6 +2040,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
assert_qgroups_uptodate(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
+ ASSERT(list_empty(&cur_trans->io_bgs));
update_super_roots(root);
btrfs_set_super_log_root(root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 937050a2b68e..0b24755596ba 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -64,9 +64,19 @@ struct btrfs_transaction {
struct list_head pending_ordered;
struct list_head switch_commits;
struct list_head dirty_bgs;
+ struct list_head io_bgs;
+ u64 num_dirty_bgs;
+
+ /*
+ * We need to make sure block group deletion doesn't race with
+ * free space cache writeout. This mutex keeps them from stomping
+ * on each other.
+ */
+ struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
+ int dirty_bg_run;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -136,9 +146,11 @@ struct btrfs_pending_snapshot {
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ spin_unlock(&BTRFS_I(inode)->lock);
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c5b8ba37f88e..d04968374e9d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ insert:
if (btrfs_inode_generation(eb, src_item) == 0) {
struct extent_buffer *dst_eb = path->nodes[0];
+ const u64 ino_size = btrfs_inode_size(eb, src_item);
+ /*
+ * For regular files an ino_size == 0 is used only when
+ * logging that an inode exists, as part of a directory
+ * fsync, and the inode wasn't fsynced before. In this
+ * case don't set the size of the inode in the fs/subvol
+ * tree, otherwise we would be throwing valid data away.
+ */
if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
- S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+ ino_size != 0) {
struct btrfs_map_token token;
- u64 ino_size = btrfs_inode_size(eb, src_item);
btrfs_init_map_token(&token);
btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -1951,6 +1959,104 @@ out:
return ret;
}
+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const u64 ino)
+{
+ struct btrfs_key search_key;
+ struct btrfs_path *log_path;
+ int i;
+ int nritems;
+ int ret;
+
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_XATTR_ITEM_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+process_leaf:
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ u32 total_size;
+ u32 cur;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size_nr(path->nodes[0], i);
+ cur = 0;
+ while (cur < total_size) {
+ u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u32 this_len = sizeof(*di) + name_len + data_len;
+ char *name;
+
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(path->nodes[0], name,
+ (unsigned long)(di + 1), name_len);
+
+ log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+ name, name_len, 0);
+ btrfs_release_path(log_path);
+ if (!log_di) {
+ /* Doesn't exist in log tree, so delete it. */
+ btrfs_release_path(path);
+ di = btrfs_lookup_xattr(trans, root, path, ino,
+ name, name_len, -1);
+ kfree(name);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ ASSERT(di);
+ ret = btrfs_delete_one_dir_name(trans, root,
+ path, di);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ search_key = key;
+ goto again;
+ }
+ kfree(name);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ }
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *)di + this_len);
+ }
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ goto process_leaf;
+out:
+ btrfs_free_path(log_path);
+ btrfs_release_path(path);
+ return ret;
+}
+
+
/*
* deletion replay happens before we copy any new directory items
* out of the log or out of backreferences from inodes. It
@@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ ret = replay_xattr_deletes(wc->trans, root, log,
+ path, key.objectid);
+ if (ret)
+ break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
@@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root, next);
+ clean_tree_block(trans, root->fs_info,
+ next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
@@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root, next);
+ clean_tree_block(trans, root->fs_info,
+ next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
@@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, log, next);
+ clean_tree_block(trans, log->fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
@@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path, int key_type,
+ struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
@@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
src = path->nodes[0];
nritems = btrfs_header_nritems(src);
for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+
btrfs_item_key_to_cpu(src, &min_key, i);
if (min_key.objectid != ino || min_key.type != key_type)
@@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
err = ret;
goto done;
}
+
+ /*
+ * We must make sure that when we log a directory entry,
+ * the corresponding inode, after log replay, has a
+ * matching link count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * This would result in an fsync log that, when replayed,
+ * leaves our file inode with a link count of 1 but two
+ * directory entries pointing to the same inode. After
+ * removing one of the names, it would not be possible
+ * to remove the other name, which always resulted in
+ * stale file handle errors, and it would not be possible
+ * to rmdir the parent directory, since its i_size could
+ * never decrement to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(src, di, &tmp);
+ if (ctx &&
+ (btrfs_dir_transid(src, di) == trans->transid ||
+ btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+ tmp.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
}
path->slots[0] = nritems;
@@ -3175,7 +3321,8 @@ done:
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
{
u64 min_key;
u64 max_key;
@@ -3187,7 +3334,7 @@ again:
max_key = 0;
while (1) {
ret = log_dir_items(trans, root, inode, path,
- dst_path, key_type, min_key,
+ dst_path, key_type, ctx, min_key,
&max_key);
if (ret)
return ret;
@@ -3963,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
if (ret < 0) {
return ret;
} else if (ret > 0) {
- *size_ret = i_size_read(inode);
+ *size_ret = 0;
} else {
struct btrfs_inode_item *item;
@@ -4070,10 +4217,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
- if (inode_only == LOG_INODE_EXISTS) {
- max_key_type = BTRFS_INODE_EXTREF_KEY;
- max_key.type = max_key_type;
- }
+ if (inode_only == LOG_INODE_EXISTS)
+ max_key_type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
if (inode_only == LOG_INODE_EXISTS) {
@@ -4098,7 +4243,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
- max_key.type = BTRFS_INODE_EXTREF_KEY;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
@@ -4106,20 +4251,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&BTRFS_I(inode)->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
- ret = btrfs_truncate_inode_items(trans, log,
- inode, 0, 0);
+ while (1) {
+ ret = btrfs_truncate_inode_items(trans,
+ log, inode, 0, 0);
+ if (ret != -EAGAIN)
+ break;
+ }
}
- } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags) ||
+ } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags) ||
inode_only == LOG_INODE_EXISTS) {
- if (inode_only == LOG_INODE_ALL) {
- clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags);
+ if (inode_only == LOG_INODE_ALL)
fast_search = true;
- max_key.type = BTRFS_XATTR_ITEM_KEY;
- } else {
- max_key.type = BTRFS_INODE_EXTREF_KEY;
- }
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
@@ -4277,15 +4421,18 @@ log_extents:
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path);
+ ret = log_directory_changes(trans, root, inode, path, dst_path,
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
}
}
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+ spin_unlock(&BTRFS_I(inode)->lock);
out_unlock:
if (unlikely(err))
btrfs_put_logged_extents(&logged_list);
@@ -4327,9 +4474,9 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
goto out;
if (!S_ISDIR(inode->i_mode)) {
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
goto out;
- inode = parent->d_inode;
+ inode = d_inode(parent);
}
while (1) {
@@ -4355,7 +4502,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
break;
}
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
break;
if (IS_ROOT(parent))
@@ -4364,7 +4511,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
parent = dget_parent(parent);
dput(old_parent);
old_parent = parent;
- inode = parent->d_inode;
+ inode = d_inode(parent);
}
dput(old_parent);
@@ -4372,6 +4519,181 @@ out:
return ret;
}
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to, we do not lock their i_mutex; otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names. This does not result in a problem because if a dir_item key is
+ * logged but its matching dir_index key is not logged, at log replay time we
+ * don't use it to replay the respective name (see replay_one_name()). On the
+ * other hand if only the dir_index key ends up being logged, the respective
+ * name is added to the fs/subvol tree with both the dir_item and dir_index
+ * keys created (see replay_one_name()).
+ * The directory's inode item with a wrong i_size is not a problem either,
+ * since we don't use it at log replay time to set the i_size in the inode
+ * item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+ dir_elem->ino = btrfs_ino(start_inode);
+ list_add_tail(&dir_elem->list, &dir_list);
+
+ while (!list_empty(&dir_list)) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ int nritems;
+ int i;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+ list);
+ if (ret)
+ goto next_dir_inode;
+
+ min_key.objectid = dir_elem->ino;
+ min_key.type = BTRFS_DIR_ITEM_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+ if (ret < 0) {
+ goto next_dir_inode;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next_dir_inode;
+ }
+
+process_leaf:
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ struct btrfs_dir_list *new_dir_elem;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != dir_elem->ino ||
+ min_key.type != BTRFS_DIR_ITEM_KEY)
+ goto next_dir_inode;
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_type(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid &&
+ type != BTRFS_FT_DIR)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+ root, NULL);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto next_dir_inode;
+ }
+
+ if (btrfs_inode_in_log(di_inode, trans->transid)) {
+ iput(di_inode);
+ continue;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ btrfs_release_path(path);
+ ret = btrfs_log_inode(trans, root, di_inode,
+ log_mode, 0, LLONG_MAX, ctx);
+ iput(di_inode);
+ if (ret)
+ goto next_dir_inode;
+ if (ctx->log_new_dentries) {
+ new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+ GFP_NOFS);
+ if (!new_dir_elem) {
+ ret = -ENOMEM;
+ goto next_dir_inode;
+ }
+ new_dir_elem->ino = di_key.objectid;
+ list_add_tail(&new_dir_elem->list, &dir_list);
+ }
+ break;
+ }
+ if (i == nritems) {
+ ret = btrfs_next_leaf(log, path);
+ if (ret < 0) {
+ goto next_dir_inode;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next_dir_inode;
+ }
+ goto process_leaf;
+ }
+ if (min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+next_dir_inode:
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -4394,6 +4716,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
const struct dentry * const first_parent = parent;
const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
last_committed);
+ bool log_dentries = false;
+ struct inode *orig_inode = inode;
sb = inode->i_sb;
@@ -4449,11 +4773,14 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
+ if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+ log_dentries = true;
+
while (1) {
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
break;
- inode = parent->d_inode;
+ inode = d_inode(parent);
if (root != BTRFS_I(inode)->root)
break;
@@ -4485,7 +4812,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
dput(old_parent);
old_parent = parent;
}
- ret = 0;
+ if (log_dentries)
+ ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+ else
+ ret = 0;
end_trans:
dput(old_parent);
if (ret < 0) {
@@ -4515,7 +4845,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+ ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
start, end, 0, ctx);
dput(parent);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 154990c26dcb..6916a781ea02 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -29,6 +29,7 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
int io_err;
+ bool log_new_dentries;
struct list_head list;
};
@@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->io_err = 0;
+ ctx->log_new_dentries = false;
INIT_LIST_HEAD(&ctx->list);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8222f6f74147..96aebf3bcd5b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,8 +366,8 @@ loop_lock:
btrfsic_submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
- if (need_resched())
- cond_resched();
+
+ cond_resched();
/*
* we made progress, there is more work to do and the bdi
@@ -400,8 +400,7 @@ loop_lock:
* against it before looping
*/
last_waited = ioc->last_waited;
- if (need_resched())
- cond_resched();
+ cond_resched();
continue;
}
spin_lock(&device->io_lock);
@@ -609,8 +608,7 @@ error:
return ERR_PTR(-ENOMEM);
}
-void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
struct btrfs_device *latest_dev = NULL;
@@ -1060,6 +1058,7 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
struct extent_map *em;
struct list_head *search_list = &trans->transaction->pending_chunks;
int ret = 0;
+ u64 physical_start = *start;
again:
list_for_each_entry(em, search_list, list) {
@@ -1070,9 +1069,9 @@ again:
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev != device)
continue;
- if (map->stripes[i].physical >= *start + len ||
+ if (map->stripes[i].physical >= physical_start + len ||
map->stripes[i].physical + em->orig_block_len <=
- *start)
+ physical_start)
continue;
*start = map->stripes[i].physical +
em->orig_block_len;
@@ -1136,11 +1135,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-again:
+
max_hole_start = search_start;
max_hole_size = 0;
- hole_size = 0;
+again:
if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
goto out;
@@ -1195,8 +1194,14 @@ again:
*/
if (contains_pending_extent(trans, device,
&search_start,
- hole_size))
- hole_size = 0;
+ hole_size)) {
+ if (key.offset >= search_start) {
+ hole_size = key.offset - search_start;
+ } else {
+ WARN_ON_ONCE(1);
+ hole_size = 0;
+ }
+ }
if (hole_size > max_hole_size) {
max_hole_start = search_start;
@@ -1233,21 +1238,23 @@ next:
* allocated dev extents, and when shrinking the device,
* search_end may be smaller than search_start.
*/
- if (search_end > search_start)
+ if (search_end > search_start) {
hole_size = search_end - search_start;
- if (hole_size > max_hole_size) {
- max_hole_start = search_start;
- max_hole_size = hole_size;
- }
+ if (contains_pending_extent(trans, device, &search_start,
+ hole_size)) {
+ btrfs_release_path(path);
+ goto again;
+ }
- if (contains_pending_extent(trans, device, &search_start, hole_size)) {
- btrfs_release_path(path);
- goto again;
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
}
/* See above. */
- if (hole_size < num_bytes)
+ if (max_hole_size < num_bytes)
ret = -ENOSPC;
else
ret = 0;
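
A worked example of the hole accounting these find_free_dev_extent() hunks fix, with hypothetical offsets: when contains_pending_extent() advances search_start past a pending (not yet committed) chunk, the old code discarded the hole entirely (hole_size = 0), losing everything between the new search_start and the next dev extent; the new code keeps that remainder, and the final -ENOSPC check now tests max_hole_size instead of the last hole_size computed:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t key_offset   = 100ULL << 20; /* next dev extent at 100MiB */
	uint64_t search_start =  10ULL << 20; /* hole begins at 10MiB */

	/* Suppose a pending chunk occupies [10MiB, 20MiB): the search
	 * start is pushed past it, as contains_pending_extent() does. */
	search_start = 20ULL << 20;

	uint64_t old_hole = 0;	/* pre-patch: hole discarded entirely */
	uint64_t new_hole = key_offset >= search_start
			    ? key_offset - search_start : 0;

	printf("old: %llu MiB, new: %llu MiB\n",
	       (unsigned long long)(old_hole >> 20),
	       (unsigned long long)(new_hole >> 20)); /* old: 0, new: 80 */
	return 0;
}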
@@ -2487,8 +2494,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 chunk_tree, u64 chunk_objectid,
+ struct btrfs_root *root, u64 chunk_objectid,
u64 chunk_offset)
{
int ret;
@@ -2580,7 +2586,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct map_lookup *map;
u64 dev_extent_len = 0;
u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- u64 chunk_tree = root->fs_info->chunk_root->objectid;
int i, ret = 0;
/* Just in case */
@@ -2634,8 +2639,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
}
}
}
- ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
- chunk_offset);
+ ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
@@ -2664,8 +2668,8 @@ out:
}
static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset)
+ u64 chunk_objectid,
+ u64 chunk_offset)
{
struct btrfs_root *extent_root;
struct btrfs_trans_handle *trans;
@@ -2707,7 +2711,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
struct btrfs_chunk *chunk;
struct btrfs_key key;
struct btrfs_key found_key;
- u64 chunk_tree = chunk_root->root_key.objectid;
u64 chunk_type;
bool retried = false;
int failed = 0;
@@ -2744,7 +2747,7 @@ again:
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+ ret = btrfs_relocate_chunk(chunk_root,
found_key.objectid,
found_key.offset);
if (ret == -ENOSPC)
@@ -3022,7 +3025,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
stripe_offset = btrfs_stripe_offset(leaf, stripe);
stripe_length = btrfs_chunk_length(leaf, chunk);
- do_div(stripe_length, factor);
+ stripe_length = div_u64(stripe_length, factor);
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
@@ -3255,7 +3258,6 @@ again:
}
ret = btrfs_relocate_chunk(chunk_root,
- chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
if (ret && ret != -ENOSPC)
@@ -3957,7 +3959,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
- u64 chunk_tree;
u64 chunk_objectid;
u64 chunk_offset;
int ret;
@@ -4027,13 +4028,11 @@ again:
break;
}
- chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
- ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
- chunk_offset);
+ ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
@@ -4131,7 +4130,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
-static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
.dev_stripes = 1,
@@ -4289,7 +4288,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
- devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
return -ENOMEM;
@@ -4400,8 +4399,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
*/
if (stripe_size * data_stripes > max_chunk_size) {
u64 mask = (1ULL << 24) - 1;
- stripe_size = max_chunk_size;
- do_div(stripe_size, data_stripes);
+
+ stripe_size = div_u64(max_chunk_size, data_stripes);
/* bump the answer up to a 16MB boundary */
stripe_size = (stripe_size + mask) & ~mask;
@@ -4413,10 +4412,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = devices_info[ndevs-1].max_avail;
}
- do_div(stripe_size, dev_stripes);
+ stripe_size = div_u64(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
- do_div(stripe_size, raid_stripe_len);
+ stripe_size = div_u64(stripe_size, raid_stripe_len);
stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -4954,7 +4953,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 stripe_nr_orig;
u64 stripe_nr_end;
u64 stripe_len;
- int stripe_index;
+ u32 stripe_index;
int i;
int ret = 0;
int num_stripes;
@@ -4995,7 +4994,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
- do_div(stripe_nr, stripe_len);
+ stripe_nr = div64_u64(stripe_nr, stripe_len);
stripe_offset = stripe_nr * stripe_len;
BUG_ON(offset < stripe_offset);
@@ -5011,7 +5010,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* allow a write of a full stripe, but make sure we don't
* allow straddling of stripes
*/
- do_div(raid56_full_stripe_start, full_stripe_len);
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+ full_stripe_len);
raid56_full_stripe_start *= full_stripe_len;
}
@@ -5136,7 +5136,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_index = 0;
stripe_nr_orig = stripe_nr;
stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
- do_div(stripe_nr_end, map->stripe_len);
+ stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
stripe_end_offset = stripe_nr_end * map->stripe_len -
(offset + *length);
@@ -5144,7 +5144,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & REQ_DISCARD)
num_stripes = min_t(u64, map->num_stripes,
stripe_nr_end - stripe_nr_orig);
- stripe_index = do_div(stripe_nr, map->num_stripes);
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
@@ -5170,9 +5171,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- int factor = map->num_stripes / map->sub_stripes;
+ u32 factor = map->num_stripes / map->sub_stripes;
- stripe_index = do_div(stripe_nr, factor);
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
@@ -5198,8 +5199,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
- stripe_nr = raid56_full_stripe_start;
- do_div(stripe_nr, stripe_len * nr_data_stripes(map));
+ stripe_nr = div_u64(raid56_full_stripe_start,
+ stripe_len * nr_data_stripes(map));
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
@@ -5209,32 +5210,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_index = 0;
stripe_offset = 0;
} else {
- u64 tmp;
-
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
* Mirror #3 is RAID6 Q block.
*/
- stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+ stripe_nr = div_u64_rem(stripe_nr,
+ nr_data_stripes(map), &stripe_index);
if (mirror_num > 1)
stripe_index = nr_data_stripes(map) +
mirror_num - 2;
/* We distribute the parity blocks across stripes */
- tmp = stripe_nr + stripe_index;
- stripe_index = do_div(tmp, map->num_stripes);
+ div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+ &stripe_index);
if (!(rw & (REQ_WRITE | REQ_DISCARD |
REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
mirror_num = 1;
}
} else {
/*
- * after this do_div call, stripe_nr is the number of stripes
- * on this device we have to walk to find the data, and
- * stripe_index is the number of our device in the stripe array
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
*/
- stripe_index = do_div(stripe_nr, map->num_stripes);
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
mirror_num = stripe_index + 1;
}
BUG_ON(stripe_index >= map->num_stripes);
@@ -5261,7 +5262,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
u64 tmp;
- int i, rot;
+ unsigned rot;
bbio->raid_map = (u64 *)((void *)bbio->stripes +
sizeof(struct btrfs_bio_stripe) *
@@ -5269,8 +5270,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
sizeof(int) * tgtdev_indexes);
/* Work out the disk rotation on this stripe-set */
- tmp = stripe_nr;
- rot = do_div(tmp, num_stripes);
+ div_u64_rem(stripe_nr, num_stripes, &rot);
/* Fill in the logical address of each stripe */
tmp = stripe_nr * nr_data_stripes(map);
@@ -5285,8 +5285,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
if (rw & REQ_DISCARD) {
- int factor = 0;
- int sub_stripes = 0;
+ u32 factor = 0;
+ u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
@@ -5437,9 +5437,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
if (found) {
- u64 length = map->stripe_len;
-
- if (physical_of_found + length <=
+ if (physical_of_found + map->stripe_len <=
dev_replace->cursor_left) {
struct btrfs_bio_stripe *tgtdev_stripe =
bbio->stripes + num_stripes;
@@ -5535,15 +5533,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
rmap_len = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- do_div(length, map->num_stripes / map->sub_stripes);
+ length = div_u64(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- do_div(length, map->num_stripes);
+ length = div_u64(length, map->num_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- do_div(length, nr_data_stripes(map));
+ length = div_u64(length, nr_data_stripes(map));
rmap_len = map->stripe_len * nr_data_stripes(map);
}
- buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+ buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
@@ -5554,11 +5552,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
continue;
stripe_nr = physical - map->stripes[i].physical;
- do_div(stripe_nr, map->stripe_len);
+ stripe_nr = div_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
- do_div(stripe_nr, map->sub_stripes);
+ stripe_nr = div_u64(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
} /* else if RAID[56], multiply by nr_data_stripes().
@@ -5835,8 +5833,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 length = 0;
u64 map_length;
int ret;
- int dev_nr = 0;
- int total_devs = 1;
+ int dev_nr;
+ int total_devs;
struct btrfs_bio *bbio = NULL;
length = bio->bi_iter.bi_size;
@@ -5877,11 +5875,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG();
}
- while (dev_nr < total_devs) {
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
bbio_error(bbio, first_bio, logical);
- dev_nr++;
continue;
}
@@ -5894,7 +5891,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
ret = breakup_stripe_bio(root, bbio, first_bio, dev,
dev_nr, rw, async_submit);
BUG_ON(ret);
- dev_nr++;
continue;
}
@@ -5909,7 +5905,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
submit_stripe_bio(root, bbio, bio,
bbio->stripes[dev_nr].physical, dev_nr, rw,
async_submit);
- dev_nr++;
}
btrfs_bio_counter_dec(root->fs_info);
return 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 83069dec6898..ebc31331a837 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -421,8 +421,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
char *device_path,
struct btrfs_device **device);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 883b93623bc5..6f518c90e1c1 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -261,7 +261,7 @@ out:
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct btrfs_key key, found_key;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = {
/*
* Check if the attribute is in a supported namespace.
*
- * This applied after the check for the synthetic attributes in the system
+ * This is applied after the check for the synthetic attributes in the system
* namespace.
*/
-static bool btrfs_is_valid_xattr(const char *name)
+static int btrfs_is_valid_xattr(const char *name)
{
- return !strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN) ||
- !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
- !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
- !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN);
+ int len = strlen(name);
+ int prefixlen = 0;
+
+ if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN))
+ prefixlen = XATTR_SECURITY_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ prefixlen = XATTR_SYSTEM_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+ prefixlen = XATTR_TRUSTED_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ prefixlen = XATTR_USER_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ prefixlen = XATTR_BTRFS_PREFIX_LEN;
+ else
+ return -EOPNOTSUPP;
+
+ /*
+ * The name cannot consist of just a prefix
+ */
+ if (len <= prefixlen)
+ return -EINVAL;
+
+ return 0;
}
ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
+ int ret;
+
/*
* If this is a request for a synthetic attribute in the system.*
* namespace use the generic infrastructure to resolve a handler
@@ -388,15 +408,17 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_getxattr(dentry, name, buffer, size);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
- return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
+ return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
}
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
- struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
+ int ret;
/*
* The permission on security.* and system.* is not checked
@@ -413,23 +435,25 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_setxattr(dentry, name, value, size, flags);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(dentry->d_inode, name,
+ return btrfs_set_prop(d_inode(dentry), name,
value, size, flags);
if (size == 0)
value = ""; /* empty EA, do not remove */
- return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size,
+ return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
flags);
}
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
- struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
+ int ret;
/*
* The permission on security.* and system.* is not checked
@@ -446,14 +470,15 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_removexattr(dentry, name);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(dentry->d_inode, name,
+ return btrfs_set_prop(d_inode(dentry), name,
NULL, 0, XATTR_REPLACE);
- return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
+ return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
XATTR_REPLACE);
}
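
From userspace, the effect of the reworked btrfs_is_valid_xattr() is visible in the errno: an unsupported namespace still fails with EOPNOTSUPP, while a name that is nothing more than a supported prefix now fails with EINVAL instead. A small test sketch, assuming a file on a btrfs mount at the hypothetical path /mnt/btrfs/file:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/btrfs/file";	/* hypothetical test file */

	/* Unsupported namespace: still EOPNOTSUPP. */
	if (setxattr(path, "bogus.attr", "x", 1, 0) < 0)
		printf("bogus.attr: %s\n", strerror(errno));

	/* Bare prefix, no attribute name: EINVAL after this patch. */
	if (setxattr(path, "user.", "x", 1, 0) < 0)
		printf("user.: %s\n", strerror(errno));

	return 0;
}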
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index fb22fd8d8fb8..82990b8f872b 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -403,7 +403,7 @@ next:
return ret;
}
-struct btrfs_compress_op btrfs_zlib_compress = {
+const struct btrfs_compress_op btrfs_zlib_compress = {
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,