summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/ctree.h205
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/disk-io.c20
-rw-r--r--fs/btrfs/extent-tree.c203
-rw-r--r--fs/btrfs/file.c4
-rw-r--r--fs/btrfs/free-space-cache.c112
-rw-r--r--fs/btrfs/inode.c4
-rw-r--r--fs/btrfs/ioctl.c226
-rw-r--r--fs/btrfs/ioctl.h54
-rw-r--r--fs/btrfs/super.c49
-rw-r--r--fs/btrfs/transaction.c9
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c790
-rw-r--r--fs/btrfs/volumes.h51
14 files changed, 1491 insertions, 240 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..dfc136cc07d7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
__le16 name_len;
} __attribute__ ((__packed__));
+struct btrfs_disk_balance_args {
+ /*
+ * profiles to operate on, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 profiles;
+
+ /* usage filter */
+ __le64 usage;
+
+ /* devid filter */
+ __le64 devid;
+
+ /* devid subset filter [pstart..pend) */
+ __le64 pstart;
+ __le64 pend;
+
+ /* btrfs virtual address space subset filter [vstart..vend) */
+ __le64 vstart;
+ __le64 vend;
+
+ /*
+ * profile to convert to, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 target;
+
+ /* BTRFS_BALANCE_ARGS_* */
+ __le64 flags;
+
+ __le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+ /* BTRFS_BALANCE_* */
+ __le64 flags;
+
+ struct btrfs_disk_balance_args data;
+ struct btrfs_disk_balance_args meta;
+ struct btrfs_disk_balance_args sys;
+
+ __le64 unused[4];
+} __attribute__ ((__packed__));
+
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
} __attribute__ ((__packed__));
/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
-#define BTRFS_NR_RAID_TYPES 5
+#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES 5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
+ BTRFS_BLOCK_GROUP_SYSTEM | \
+ BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available. This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk). The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
struct btrfs_block_group_item {
__le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_balance_control;
struct btrfs_delayed_root;
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
+ /*
+ * these three are in extended format (availability of single
+ * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
- u64 data_alloc_profile;
- u64 metadata_alloc_profile;
- u64 system_alloc_profile;
+
+ /* restriper state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_running;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
unsigned data_chunk_allocations;
unsigned metadata_ratio;
@@ -1383,6 +1464,8 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
+#define BTRFS_BALANCE_ITEM_KEY 248
+
/*
* string items are for debugging. They just store a short string of
* data in the FS
@@ -1413,6 +1496,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
#define BTRFS_MOUNT_RECOVERY (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2161,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2500,6 +2662,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
+ kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
@@ -2510,6 +2673,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->super_for_commit);
kfree(fs_info);
}
+/**
+ * profile_is_valid - tests whether a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static inline int profile_is_valid(u64 flags, int extended)
+{
+ u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+ if (extended)
+ mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & mask)
+ return 0;
+ /* true if zero or exactly one bit set */
+ return (flags & (~flags + 1)) == flags;
+}
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index c7ddf8a01c54..9c1eccc2c503 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1719,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode->i_gid = btrfs_stack_inode_gid(inode_item);
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
- inode->i_nlink = btrfs_stack_inode_nlink(inode_item);
+ set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 858ab347413e..e5167219c266 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2002,12 +2002,20 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_rwsem(&fs_info->scrub_super_lock);
fs_info->scrub_workers_refcnt = 0;
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
+
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- fs_info->btree_inode->i_nlink = 1;
+ set_nlink(fs_info->btree_inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
@@ -2319,9 +2327,6 @@ retry_root_backup:
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
ret = btrfs_init_space_info(fs_info);
if (ret) {
@@ -2424,6 +2429,10 @@ retry_root_backup:
if (!err)
err = btrfs_orphan_cleanup(fs_info->tree_root);
up_read(&fs_info->cleanup_work_sem);
+
+ if (!err)
+ err = btrfs_recover_balance(fs_info->tree_root);
+
if (err) {
close_ctree(tree_root);
return ERR_PTR(err);
@@ -2973,6 +2982,9 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* pause restriper - we want to resume on mount */
+ btrfs_pause_balance(root->fs_info);
+
btrfs_scrub_cancel(root);
/* wait for any defraggers to finish */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bf30f670cda9..1c1cf216be80 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA;
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -2267,9 +2266,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
BUG_ON(ret);
kfree(extent_op);
- cond_resched();
- spin_lock(&delayed_refs->lock);
- continue;
+ goto next;
}
list_del_init(&locked_ref->cluster);
@@ -2289,7 +2286,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
btrfs_put_delayed_ref(ref);
kfree(extent_op);
count++;
-
+next:
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
cond_resched();
spin_lock(&delayed_refs->lock);
}
@@ -2317,6 +2318,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
again:
@@ -2993,9 +2998,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
spin_lock_init(&found->lock);
- found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
- BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA);
+ found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
found->total_bytes = total_bytes;
found->disk_total = total_bytes * factor;
found->bytes_used = bytes_used;
@@ -3016,20 +3019,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP);
- if (extra_flags) {
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits |= extra_flags;
- }
+ u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ /* chunk -> extended profile */
+ if (extra_flags == 0)
+ extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
}
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format. If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
/*
@@ -3040,6 +3050,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ /* pick restriper's target profile if it's available */
+ spin_lock(&root->fs_info->balance_lock);
+ if (root->fs_info->balance_ctl) {
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ u64 tgt = 0;
+
+ if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+ (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->data.target)) {
+ tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->sys.target)) {
+ tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->meta.target)) {
+ tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ if (tgt) {
+ spin_unlock(&root->fs_info->balance_lock);
+ flags = tgt;
+ goto out;
+ }
+ }
+ spin_unlock(&root->fs_info->balance_lock);
+
if (num_devices == 1)
flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
if (num_devices < 4)
@@ -3059,22 +3097,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
((flags & BTRFS_BLOCK_GROUP_RAID1) |
(flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP)))
+ (flags & BTRFS_BLOCK_GROUP_DUP))) {
flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+ }
+
+out:
+ /* extended -> chunk profile */
+ flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
return flags;
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits &
- root->fs_info->data_alloc_profile;
+ flags |= root->fs_info->avail_data_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits &
- root->fs_info->system_alloc_profile;
+ flags |= root->fs_info->avail_system_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits &
- root->fs_info->metadata_alloc_profile;
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+
return btrfs_reduce_alloc_profile(root, flags);
}
@@ -3257,27 +3298,12 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_bytes - num_allocated < thresh)
return 1;
}
-
- /*
- * we have two similar checks here, one based on percentage
- * and once based on a hard number of 256MB. The idea
- * is that if we have a good amount of free
- * room, don't allocate a chunk. A good mount is
- * less than 80% utilized of the chunks we have allocated,
- * or more than 256MB free
- */
- if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
- return 0;
-
- if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
- return 0;
-
thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- /* 256MB or 5% of the FS */
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+ /* 256MB or 2% of the FS */
+ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+ if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -3291,7 +3317,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
- flags = btrfs_reduce_alloc_profile(extent_root, flags);
+ BUG_ON(!profile_is_valid(flags, 0));
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
@@ -3416,7 +3442,8 @@ static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
smp_mb();
nr_pages = min_t(unsigned long, nr_pages,
root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
- writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
+ writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
+ WB_REASON_FS_FREE_SPACE);
spin_lock(&space_info->lock);
if (reserved > space_info->bytes_may_use)
@@ -5294,15 +5321,6 @@ alloc:
if (unlikely(block_group->ro))
goto loop;
- spin_lock(&block_group->free_space_ctl->tree_lock);
- if (cached &&
- block_group->free_space_ctl->free_space <
- num_bytes + empty_cluster + empty_size) {
- spin_unlock(&block_group->free_space_ctl->tree_lock);
- goto loop;
- }
- spin_unlock(&block_group->free_space_ctl->tree_lock);
-
/*
* Ok we want to try and use the cluster allocator, so
* lets look there
@@ -5348,8 +5366,15 @@ refill_cluster:
* plenty of times and not have found
* anything, so we are likely way too
* fragmented for the clustering stuff to find
- * anything. */
- if (loop >= LOOP_NO_EMPTY_SIZE) {
+ * anything.
+ *
+ * However, if the cluster is taken from the
+ * current block group, release the cluster
+ * first, so that we stand a better chance of
+ * succeeding in the unclustered
+ * allocation. */
+ if (loop >= LOOP_NO_EMPTY_SIZE &&
+ last_ptr->block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
}
@@ -5360,6 +5385,11 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
@@ -5400,6 +5430,15 @@ refill_cluster:
}
unclustered_alloc:
+ spin_lock(&block_group->free_space_ctl->tree_lock);
+ if (cached &&
+ block_group->free_space_ctl->free_space <
+ num_bytes + empty_cluster + empty_size) {
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+ goto loop;
+ }
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
/*
@@ -5437,9 +5476,6 @@ checks:
goto loop;
}
- ins->objectid = search_start;
- ins->offset = num_bytes;
-
if (offset < search_start)
btrfs_add_free_space(used_block_group, offset,
search_start - offset);
@@ -6791,6 +6827,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+ if (root->fs_info->balance_ctl) {
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ u64 tgt = 0;
+
+ /* pick restriper's target profile and return */
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ if (tgt) {
+ /* extended -> chunk profile */
+ tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ return tgt;
+ }
+ }
+
/*
* we add in the count of missing devices because we want
* to make sure that any RAID levels on a degraded FS
@@ -7466,6 +7525,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return 0;
}
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ /* chunk -> extended profile */
+ if (extra_flags == 0)
+ extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start)
{
@@ -7476,6 +7551,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct inode *inode;
int ret;
+ int index;
int factor;
root = root->fs_info->extent_root;
@@ -7491,6 +7567,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, block_group);
memcpy(&key, &block_group->key, sizeof(key));
+ index = get_block_group_index(block_group);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7642,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index]))
+ clear_avail_alloc_bits(root->fs_info, block_group->flags);
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index cc7492c823f3..97fbe939c050 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
(sizeof(struct page *)));
+ nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+ nrptrs = max(nrptrs, 8);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
if (!pages)
return -ENOMEM;
@@ -1836,7 +1838,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
switch (origin) {
case SEEK_END:
case SEEK_CUR:
- offset = generic_file_llseek_unlocked(file, offset, origin);
+ offset = generic_file_llseek(file, offset, origin);
goto out;
case SEEK_DATA:
case SEEK_HOLE:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b3cbb8939fa3..6c7887a7770c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2289,23 +2289,23 @@ out:
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
- u64 offset, u64 bytes, u64 min_bytes)
+ u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
unsigned long next_zero;
unsigned long i;
- unsigned long search_bits;
- unsigned long total_bits;
+ unsigned long want_bits;
+ unsigned long min_bits;
unsigned long found_bits;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
- bool found = false;
i = offset_to_bit(entry->offset, block_group->sectorsize,
max_t(u64, offset, entry->offset));
- search_bits = bytes_to_bits(bytes, block_group->sectorsize);
- total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+ min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
again:
found_bits = 0;
@@ -2314,7 +2314,7 @@ again:
i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
- if (next_zero - i >= search_bits) {
+ if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
break;
}
@@ -2324,10 +2324,9 @@ again:
if (!found_bits)
return -ENOSPC;
- if (!found) {
+ if (!total_found) {
start = i;
cluster->max_size = 0;
- found = true;
}
total_found += found_bits;
@@ -2335,13 +2334,8 @@ again:
if (cluster->max_size < found_bits * block_group->sectorsize)
cluster->max_size = found_bits * block_group->sectorsize;
- if (total_found < total_bits) {
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
- if (i - start > total_bits * 2) {
- total_found = 0;
- cluster->max_size = 0;
- found = false;
- }
+ if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+ i = next_zero + 1;
goto again;
}
@@ -2357,23 +2351,23 @@ again:
/*
* This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *first = NULL;
struct btrfs_free_space *entry = NULL;
- struct btrfs_free_space *prev = NULL;
struct btrfs_free_space *last;
struct rb_node *node;
u64 window_start;
u64 window_free;
u64 max_extent;
- u64 max_gap = 128 * 1024;
entry = tree_search_offset(ctl, offset, 0, 1);
if (!entry)
@@ -2383,8 +2377,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
* We don't want bitmaps, so just move along until we find a normal
* extent entry.
*/
- while (entry->bitmap) {
- if (list_empty(&entry->list))
+ while (entry->bitmap || entry->bytes < min_bytes) {
+ if (entry->bitmap && list_empty(&entry->list))
list_add_tail(&entry->list, bitmaps);
node = rb_next(&entry->offset_index);
if (!node)
@@ -2397,12 +2391,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
max_extent = entry->bytes;
first = entry;
last = entry;
- prev = entry;
- while (window_free <= min_bytes) {
- node = rb_next(&entry->offset_index);
- if (!node)
- return -ENOSPC;
+ for (node = rb_next(&entry->offset_index); node;
+ node = rb_next(&entry->offset_index)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
@@ -2411,26 +2402,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
continue;
}
- /*
- * we haven't filled the empty size and the window is
- * very large. reset and try again
- */
- if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
- entry->offset - window_start > (min_bytes * 2)) {
- first = entry;
- window_start = entry->offset;
- window_free = entry->bytes;
- last = entry;
+ if (entry->bytes < min_bytes)
+ continue;
+
+ last = entry;
+ window_free += entry->bytes;
+ if (entry->bytes > max_extent)
max_extent = entry->bytes;
- } else {
- last = entry;
- window_free += entry->bytes;
- if (entry->bytes > max_extent)
- max_extent = entry->bytes;
- }
- prev = entry;
}
+ if (window_free < bytes || max_extent < cont1_bytes)
+ return -ENOSPC;
+
cluster->window_start = first->offset;
node = &first->offset_index;
@@ -2444,7 +2427,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
- if (entry->bitmap)
+ if (entry->bitmap || entry->bytes < min_bytes)
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
@@ -2466,7 +2449,7 @@ static noinline int
setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
@@ -2491,7 +2474,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
if (entry->bytes < min_bytes)
continue;
ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
- bytes, min_bytes);
+ bytes, cont1_bytes, min_bytes);
if (!ret)
return 0;
}
@@ -2505,7 +2488,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
/*
* here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
* We might not find them all in one contiguous area.
*
* returns zero and sets up cluster if things worked out, otherwise
@@ -2521,23 +2504,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
struct btrfs_free_space *entry, *tmp;
LIST_HEAD(bitmaps);
u64 min_bytes;
+ u64 cont1_bytes;
int ret;
- /* for metadata, allow allocates with more holes */
+ /*
+ * Choose the minimum extent size we'll require for this
+ * cluster. For SSD_SPREAD, don't allow any fragmentation.
+ * For metadata, allow allocates with smaller extents. For
+ * data, keep it dense.
+ */
if (btrfs_test_opt(root, SSD_SPREAD)) {
- min_bytes = bytes + empty_size;
+ cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
- /*
- * we want to do larger allocations when we are
- * flushing out the delayed refs, it helps prevent
- * making more work as we go along.
- */
- if (trans->transaction->delayed_refs.flushing)
- min_bytes = max(bytes, (bytes + empty_size) >> 1);
- else
- min_bytes = max(bytes, (bytes + empty_size) >> 4);
- } else
- min_bytes = max(bytes, (bytes + empty_size) >> 2);
+ cont1_bytes = bytes;
+ min_bytes = block_group->sectorsize;
+ } else {
+ cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+ min_bytes = block_group->sectorsize;
+ }
spin_lock(&ctl->tree_lock);
@@ -2545,7 +2529,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
* If we know we don't have enough space to make a cluster don't even
* bother doing all the work to try and find one.
*/
- if (ctl->free_space < min_bytes) {
+ if (ctl->free_space < bytes) {
spin_unlock(&ctl->tree_lock);
return -ENOSPC;
}
@@ -2559,10 +2543,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
}
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
- bytes, min_bytes);
+ bytes + empty_size,
+ cont1_bytes, min_bytes);
if (ret)
ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
- offset, bytes, min_bytes);
+ offset, bytes + empty_size,
+ cont1_bytes, min_bytes);
/* Clear our temporary list */
list_for_each_entry_safe(entry, tmp, &bitmaps, list)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 13b0542015ff..fd1a06df5bc6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2358,7 +2358,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
+ set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
inode->i_uid = btrfs_inode_uid(leaf, inode_item);
inode->i_gid = btrfs_inode_gid(leaf, inode_item);
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
@@ -6698,7 +6698,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, new_root, inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fe8a60c865eb..ef909b5d3d2e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1213,13 +1213,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- mutex_lock(&root->fs_info->volume_mutex);
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
@@ -1236,7 +1244,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (!strcmp(sizestr, "max"))
new_size = device->bdev->bd_inode->i_size;
@@ -1251,7 +1259,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
new_size = memparse(sizestr, NULL);
if (new_size == 0) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
}
@@ -1260,7 +1268,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
new_size = old_size - new_size;
} else if (mod > 0) {
@@ -1269,11 +1277,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (new_size < 256 * 1024 * 1024) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (new_size > device->bdev->bd_inode->i_size) {
ret = -EFBIG;
- goto out_unlock;
+ goto out_free;
}
do_div(new_size, root->sectorsize);
@@ -1286,7 +1294,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_unlock;
+ goto out_free;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
@@ -1294,9 +1302,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
ret = btrfs_shrink_device(device, new_size);
}
-out_unlock:
- mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2062,14 +2071,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2084,14 +2104,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_rm_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -3044,6 +3075,163 @@ out:
return ret;
}
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ bargs->flags = bctl->flags;
+
+ if (atomic_read(&fs_info->balance_running))
+ bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+ if (atomic_read(&fs_info->balance_pause_req))
+ bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+ if (atomic_read(&fs_info->balance_cancel_req))
+ bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+ memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+ memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+ memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+ if (lock) {
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
+ } else {
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ }
+}
+
+static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_balance_control *bctl;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (arg) {
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ goto out;
+ }
+
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out_bargs;
+ }
+
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
+ goto do_balance;
+ }
+ } else {
+ bargs = NULL;
+ }
+
+ if (fs_info->balance_ctl) {
+ ret = -EINPROGRESS;
+ goto out_bargs;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out_bargs;
+ }
+
+ bctl->fs_info = fs_info;
+ if (arg) {
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+ bctl->flags = bargs->flags;
+ } else {
+ /* balance everything - no filters */
+ bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ }
+
+do_balance:
+ ret = btrfs_balance(bctl, bargs);
+ /*
+ * bctl is freed in __cancel_balance or in free_fs_info if
+ * restriper was paused all the way until unmount
+ */
+ if (arg) {
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+ }
+
+out_bargs:
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case BTRFS_BALANCE_CTL_PAUSE:
+ return btrfs_pause_balance(root->fs_info);
+ case BTRFS_BALANCE_CTL_CANCEL:
+ return btrfs_cancel_balance(root->fs_info);
+ }
+
+ return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ if (!bargs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ update_ioctl_balance_args(fs_info, 1, bargs);
+
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -3088,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
- return btrfs_balance(root->fs_info->dev_root);
+ return btrfs_ioctl_balance(root, NULL);
case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg, 0, 0, 0);
case BTRFS_IOC_CLONE_RANGE:
@@ -3120,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_scrub_cancel(root, argp);
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(root, argp);
+ case BTRFS_IOC_BALANCE_V2:
+ return btrfs_ioctl_balance(root, argp);
+ case BTRFS_IOC_BALANCE_CTL:
+ return btrfs_ioctl_balance_ctl(root, arg);
+ case BTRFS_IOC_BALANCE_PROGRESS:
+ return btrfs_ioctl_balance_progress(root, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
__u64 reserved[124]; /* pad to 1k */
};
+/* balance control ioctl modes */
+#define BTRFS_BALANCE_CTL_PAUSE 1
+#define BTRFS_BALANCE_CTL_CANCEL 2
+
+/*
+ * this is packed, because it should be exactly the same as its disk
+ * byte order counterpart (struct btrfs_disk_balance_args)
+ */
+struct btrfs_balance_args {
+ __u64 profiles;
+ __u64 usage;
+ __u64 devid;
+ __u64 pstart;
+ __u64 pend;
+ __u64 vstart;
+ __u64 vend;
+
+ __u64 target;
+
+ __u64 flags;
+
+ __u64 unused[8];
+} __attribute__ ((__packed__));
+
+/* report balance progress to userspace */
+struct btrfs_balance_progress {
+ __u64 expected; /* estimated # of chunks that will be
+ * relocated to fulfill the request */
+ __u64 considered; /* # of chunks we have considered so far */
+ __u64 completed; /* # of chunks relocated so far */
+};
+
+#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
+#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
+#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
+
+struct btrfs_ioctl_balance_args {
+ __u64 flags; /* in/out */
+ __u64 state; /* out */
+
+ struct btrfs_balance_args data; /* in/out */
+ struct btrfs_balance_args meta; /* in/out */
+ struct btrfs_balance_args sys; /* in/out */
+
+ struct btrfs_balance_progress stat; /* out */
+
+ __u64 unused[72]; /* pad to 1k */
+};
+
#define BTRFS_INO_LOOKUP_PATH_MAX 4080
struct btrfs_ioctl_ino_lookup_args {
__u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
struct btrfs_ioctl_dev_info_args)
#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
+ struct btrfs_ioctl_balance_args)
+#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
+#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
+ struct btrfs_ioctl_balance_args)
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 34a8b6112ea4..5a7227fa9380 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -164,8 +164,9 @@ enum {
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
- Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+ Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+ Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+ Opt_err,
};
static match_table_t tokens = {
@@ -200,6 +201,7 @@ static match_table_t tokens = {
{Opt_inode_cache, "inode_cache"},
{Opt_no_space_cache, "nospace_cache"},
{Opt_recovery, "recovery"},
+ {Opt_skip_balance, "skip_balance"},
{Opt_err, NULL},
};
@@ -398,6 +400,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
printk(KERN_INFO "btrfs: enabling auto recovery");
btrfs_set_opt(info->mount_opt, RECOVERY);
break;
+ case Opt_skip_balance:
+ btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -723,6 +728,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(root, INODE_MAP_CACHE))
seq_puts(seq, ",inode_cache");
+ if (btrfs_test_opt(root, SKIP_BALANCE))
+ seq_puts(seq, ",skip_balance");
return 0;
}
@@ -826,13 +833,9 @@ static char *setup_root_args(char *args)
static struct dentry *mount_subvol(const char *subvol_name, int flags,
const char *device_name, char *data)
{
- struct super_block *s;
struct dentry *root;
struct vfsmount *mnt;
- struct mnt_namespace *ns_private;
char *newargs;
- struct path path;
- int error;
newargs = setup_root_args(data);
if (!newargs)
@@ -843,39 +846,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
if (IS_ERR(mnt))
return ERR_CAST(mnt);
- ns_private = create_mnt_ns(mnt);
- if (IS_ERR(ns_private)) {
- mntput(mnt);
- return ERR_CAST(ns_private);
- }
-
- /*
- * This will trigger the automount of the subvol so we can just
- * drop the mnt we have here and return the dentry that we
- * found.
- */
- error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
- LOOKUP_FOLLOW, &path);
- put_mnt_ns(ns_private);
- if (error)
- return ERR_PTR(error);
+ root = mount_subtree(mnt, subvol_name);
- if (!is_subvolume_inode(path.dentry->d_inode)) {
- path_put(&path);
- mntput(mnt);
- error = -EINVAL;
+ if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+ struct super_block *s = root->d_sb;
+ dput(root);
+ root = ERR_PTR(-EINVAL);
+ deactivate_locked_super(s);
printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
subvol_name);
- return ERR_PTR(-EINVAL);
}
- /* Get a ref to the sb and the dentry we found and return it */
- s = path.mnt->mnt_sb;
- atomic_inc(&s->s_active);
- root = dget(path.dentry);
- path_put(&path);
- down_write(&s->s_umount);
-
return root;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..360c2dfd1ee6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -467,19 +467,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- while (count < 4) {
+ while (count < 2) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (cur &&
trans->transaction->delayed_refs.num_heads_ready > 64) {
trans->delayed_ref_updates = 0;
-
- /*
- * do a full flush if the transaction is trying
- * to close
- */
- if (trans->transaction->delayed_refs.flushing)
- cur = 0;
btrfs_run_delayed_refs(trans, root, cur);
} else {
break;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f4d81c06d48f..3568374d419d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1031,7 +1031,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
if (nlink != inode->i_nlink) {
- inode->i_nlink = nlink;
+ set_nlink(inode, nlink);
btrfs_update_inode(trans, root, inode);
}
BTRFS_I(inode)->index_cnt = (u64)-1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fbb493b28d5a..e0b7bb92a170 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
+#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
@@ -1280,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
bool clear_super = false;
mutex_lock(&uuid_mutex);
- mutex_lock(&root->fs_info->volume_mutex);
all_avail = root->fs_info->avail_data_alloc_bits |
root->fs_info->avail_system_alloc_bits |
@@ -1450,7 +1450,6 @@ error_close:
if (bdev)
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
- mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -1626,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
- mutex_lock(&root->fs_info->volume_mutex);
devices = &root->fs_info->fs_devices->devices;
/*
@@ -1754,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_relocate_sys_chunks(root);
BUG_ON(ret);
}
-out:
- mutex_unlock(&root->fs_info->volume_mutex);
+
return ret;
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -1763,7 +1760,7 @@ error:
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
- goto out;
+ return ret;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2074,6 +2071,362 @@ error:
return ret;
}
+static int insert_balance_item(struct btrfs_root *root,
+ struct btrfs_balance_control *bctl)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+ btrfs_set_balance_data(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+ btrfs_set_balance_meta(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+ btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+ btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+ /*
+ * Turn on soft mode for chunk types that were being converted.
+ */
+ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+ /*
+ * Turn on usage filter if is not already used. The idea is
+ * that chunks that we have already balanced should be
+ * reasonably full. Don't do it for chunks that are being
+ * converted - that will keep us from relocating unconverted
+ * (albeit full) chunks.
+ */
+ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->data.usage = 90;
+ }
+ if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->sys.usage = 90;
+ }
+ if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->meta.usage = 90;
+ }
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper. Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+ BUG_ON(fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ BUG_ON(!fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = NULL;
+ spin_unlock(&fs_info->balance_lock);
+
+ kfree(bctl);
+}
+
+/*
+ * Balance filters. Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_profile,
+ struct btrfs_balance_args *bargs)
+{
+ chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (chunk_profile == 0)
+ chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (bargs->profiles & chunk_profile)
+ return 0;
+
+ return 1;
+}
+
+static u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor <= 0)
+ return 0;
+ if (factor >= 100)
+ return num;
+
+ num *= factor;
+ do_div(num, 100);
+ return num;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 chunk_used, user_thresh;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+ if (chunk_used < user_thresh)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ int i;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ u64 stripe_offset;
+ u64 stripe_length;
+ int factor;
+ int i;
+
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+ return 0;
+
+ if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
+ factor = num_stripes / factor;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+ continue;
+
+ stripe_offset = btrfs_stripe_offset(leaf, stripe);
+ stripe_length = btrfs_chunk_length(leaf, chunk);
+ do_div(stripe_length, factor);
+
+ if (stripe_offset < bargs->pend &&
+ stripe_offset + stripe_length > bargs->pstart)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ if (chunk_offset < bargs->vend &&
+ chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+ /* at least part of the chunk is inside this vrange */
+ return 0;
+
+ return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_profile,
+ struct btrfs_balance_args *bargs)
+{
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return 0;
+
+ chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (chunk_profile == 0)
+ chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (bargs->target & chunk_profile)
+ return 1;
+
+ return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ struct btrfs_balance_args *bargs = NULL;
+ u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+ /* type filter */
+ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+ (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+ return 0;
+ }
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ bargs = &bctl->data;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ bargs = &bctl->sys;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ bargs = &bctl->meta;
+
+ /* profiles filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+ chunk_profiles_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /* usage filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+ chunk_devid_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
+ /* drange filter, makes sense only with devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+ chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* vrange filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+ chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* soft profile changing mode */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+ chunk_soft_convert_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ return 1;
+}
+
static u64 div_factor(u64 num, int factor)
{
if (factor == 10)
@@ -2083,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
return num;
}
-int btrfs_balance(struct btrfs_root *dev_root)
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
- int ret;
- struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct list_head *devices;
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
+ struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
- struct btrfs_trans_handle *trans;
struct btrfs_key found_key;
-
- if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&dev_root->fs_info->volume_mutex);
- dev_root = dev_root->fs_info->dev_root;
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ int slot;
+ int ret;
+ int enospc_errors = 0;
+ bool counting = true;
/* step one make some room on all the devices */
+ devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = device->total_bytes;
size_to_free = div_factor(old_size, 1);
@@ -2134,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
ret = -ENOMEM;
goto error;
}
+
+ /* zero out stat counters */
+ spin_lock(&fs_info->balance_lock);
+ memset(&bctl->stat, 0, sizeof(bctl->stat));
+ spin_unlock(&fs_info->balance_lock);
+again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -ECANCELED;
+ goto error;
+ }
+
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
@@ -2148,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
* failed
*/
if (ret == 0)
- break;
+ BUG(); /* FIXME break ? */
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
- if (ret)
+ if (ret) {
+ ret = 0;
break;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
if (found_key.objectid != key.objectid)
break;
@@ -2164,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
if (found_key.offset == 0)
break;
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+ if (!counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.considered++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ ret = should_balance_chunk(chunk_root, leaf, chunk,
+ found_key.offset);
btrfs_release_path(path);
+ if (!ret)
+ goto loop;
+
+ if (counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.expected++;
+ spin_unlock(&fs_info->balance_lock);
+ goto loop;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
if (ret && ret != -ENOSPC)
goto error;
+ if (ret == -ENOSPC) {
+ enospc_errors++;
+ } else {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+loop:
key.offset = found_key.offset - 1;
}
- ret = 0;
+
+ if (counting) {
+ btrfs_release_path(path);
+ counting = false;
+ goto again;
+ }
error:
btrfs_free_path(path);
- mutex_unlock(&dev_root->fs_info->volume_mutex);
+ if (enospc_errors) {
+ printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+ enospc_errors);
+ if (!ret)
+ ret = -ENOSPC;
+ }
+
return ret;
}
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+ /* cancel requested || normal exit path */
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ (atomic_read(&fs_info->balance_pause_req) == 0 &&
+ atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ unset_balance_control(fs_info);
+ ret = del_balance_item(fs_info->tree_root);
+ BUG_ON(ret);
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs);
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 allowed;
+ int ret;
+
+ if (btrfs_fs_closing(fs_info) ||
+ atomic_read(&fs_info->balance_pause_req) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * In case of mixed groups both data and meta should be picked,
+ * and identical options should be given for both of them.
+ */
+ allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+ if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+ if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+ !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+ memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+ printk(KERN_ERR "btrfs: with mixed groups data and "
+ "metadata balance options must be the same\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * Profile changing sanity checks. Skip them if a simple
+ * balance is requested.
+ */
+ if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
+ BTRFS_BALANCE_ARGS_CONVERT))
+ goto do_balance;
+
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ if (fs_info->fs_devices->num_devices == 1)
+ allowed |= BTRFS_BLOCK_GROUP_DUP;
+ else if (fs_info->fs_devices->num_devices < 4)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+ else
+ allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10);
+
+ if (!profile_is_valid(bctl->data.target, 1) ||
+ bctl->data.target & ~allowed) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "data profile %llu\n",
+ (unsigned long long)bctl->data.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!profile_is_valid(bctl->meta.target, 1) ||
+ bctl->meta.target & ~allowed) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "metadata profile %llu\n",
+ (unsigned long long)bctl->meta.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!profile_is_valid(bctl->sys.target, 1) ||
+ bctl->sys.target & ~allowed) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "system profile %llu\n",
+ (unsigned long long)bctl->sys.target);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
+ printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* allow to reduce meta or sys integrity only if force set */
+ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10;
+ if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_system_alloc_bits & allowed) &&
+ !(bctl->sys.target & allowed)) ||
+ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_metadata_alloc_bits & allowed) &&
+ !(bctl->meta.target & allowed))) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ printk(KERN_INFO "btrfs: force reducing metadata "
+ "integrity\n");
+ } else {
+ printk(KERN_ERR "btrfs: balance will reduce metadata "
+ "integrity, use force if you want this\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+do_balance:
+ ret = insert_balance_item(fs_info->tree_root, bctl);
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+ BUG_ON(ret == -EEXIST);
+ set_balance_control(bctl);
+ } else {
+ BUG_ON(ret != -EEXIST);
+ spin_lock(&fs_info->balance_lock);
+ update_balance_args(bctl);
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ atomic_inc(&fs_info->balance_running);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ ret = __btrfs_balance(fs_info);
+
+ mutex_lock(&fs_info->balance_mutex);
+ atomic_dec(&fs_info->balance_running);
+
+ if (bargs) {
+ memset(bargs, 0, sizeof(*bargs));
+ update_ioctl_balance_args(fs_info, 0, bargs);
+ }
+
+ if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+ balance_need_close(fs_info)) {
+ __cancel_balance(fs_info);
+ }
+
+ wake_up(&fs_info->balance_wait_q);
+
+ return ret;
+out:
+ if (bctl->flags & BTRFS_BALANCE_RESUME)
+ __cancel_balance(fs_info);
+ else
+ kfree(bctl);
+ return ret;
+}
+
+static int balance_kthread(void *data)
+{
+ struct btrfs_balance_control *bctl =
+ (struct btrfs_balance_control *)data;
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ int ret = 0;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ set_balance_control(bctl);
+
+ if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ printk(KERN_INFO "btrfs: force skipping balance\n");
+ } else {
+ printk(KERN_INFO "btrfs: continuing balance\n");
+ ret = btrfs_balance(bctl, NULL);
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+}
+
+int btrfs_recover_balance(struct btrfs_root *tree_root)
+{
+ struct task_struct *tsk;
+ struct btrfs_balance_control *bctl;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out_bctl;
+ if (ret > 0) { /* ret = -ENOENT; */
+ ret = 0;
+ goto out_bctl;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ bctl->fs_info = tree_root->fs_info;
+ bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+
+ btrfs_balance_data(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+ btrfs_balance_meta(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+ btrfs_balance_sys(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+ tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+ if (IS_ERR(tsk))
+ ret = PTR_ERR(tsk);
+ else
+ goto out;
+
+out_bctl:
+ kfree(bctl);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ if (atomic_read(&fs_info->balance_running)) {
+ atomic_inc(&fs_info->balance_pause_req);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+
+ mutex_lock(&fs_info->balance_mutex);
+ /* we are good with balance_ctl ripped off from under us */
+ BUG_ON(atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_pause_req);
+ } else {
+ ret = -ENOTCONN;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->balance_cancel_req);
+ /*
+ * if we are running just wait and return, balance item is
+ * deleted in btrfs_balance in this case
+ */
+ if (atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+ mutex_lock(&fs_info->balance_mutex);
+ } else {
+ /* __cancel_balance needs volume_mutex */
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl)
+ __cancel_balance(fs_info);
+
+ mutex_unlock(&fs_info->volume_mutex);
+ }
+
+ BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_cancel_req);
+ mutex_unlock(&fs_info->balance_mutex);
+ return 0;
+}
+
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -2437,7 +3158,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- max_stripe_size = 256 * 1024 * 1024;
+ /* for larger filesystems, use larger metadata chunks */
+ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+ max_stripe_size = 1024 * 1024 * 1024;
+ else
+ max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = 8 * 1024 * 1024;
@@ -2748,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
return ret;
alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
- (fs_info->metadata_alloc_profile &
- fs_info->avail_metadata_alloc_bits);
+ fs_info->avail_metadata_alloc_bits;
alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2759,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
sys_chunk_offset = chunk_offset + chunk_size;
alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
- (fs_info->system_alloc_profile &
- fs_info->avail_system_alloc_bits);
+ fs_info->avail_system_alloc_bits;
alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2937,10 +3660,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
if (rw & REQ_DISCARD)
*length = min_t(u64, em->len - offset, *length);
- else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP)) {
+ else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
/* we limit the length of each bio to what fits in a stripe */
*length = min_t(u64, em->len - offset,
map->stripe_len - stripe_offset);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index c1701ec9d49f..19ac95048b88 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
+#define BTRFS_BALANCE_METADATA (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
+ BTRFS_BALANCE_SYSTEM | \
+ BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE (1ULL << 3)
+#define BTRFS_BALANCE_RESUME (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
+
+/*
+ * Profile changing flags. When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+ struct btrfs_fs_info *fs_info;
+
+ struct btrfs_balance_args data;
+ struct btrfs_balance_args meta;
+ struct btrfs_balance_args sys;
+
+ u64 flags;
+
+ struct btrfs_balance_progress stat;
+};
+
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
@@ -228,7 +273,11 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);