diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/ctree.h | 205 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.c | 2 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 20 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 203 | ||||
-rw-r--r-- | fs/btrfs/file.c | 4 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 112 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 4 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 226 | ||||
-rw-r--r-- | fs/btrfs/ioctl.h | 54 | ||||
-rw-r--r-- | fs/btrfs/super.c | 49 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 9 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 2 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 790 | ||||
-rw-r--r-- | fs/btrfs/volumes.h | 51 |
14 files changed, 1491 insertions, 240 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323d..dfc136cc07d7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -86,6 +86,9 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -692,6 +695,54 @@ struct btrfs_root_ref { __le16 name_len; } __attribute__ ((__packed__)); +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* usage filter */ + __le64 usage; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + __le64 unused[8]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 @@ -751,14 +802,32 @@ struct btrfs_csum_item { } __attribute__ ((__packed__)); /* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1 << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) -#define BTRFS_BLOCK_GROUP_DUP (1 << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) -#define BTRFS_NR_RAID_TYPES 5 +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE +#define BTRFS_NR_RAID_TYPES 5 + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) +/* + * We need a bit for restriper to be able to tell when chunks of type + * SINGLE are available. This "extended" profile format is used in + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields + * (on-disk). The corresponding on-disk bit in chunk.type is reserved + * to avoid remappings between two formats in future. + */ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) struct btrfs_block_group_item { __le64 used; @@ -916,6 +985,7 @@ struct btrfs_block_group_cache { struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_balance_control; struct btrfs_delayed_root; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -1132,12 +1202,23 @@ struct btrfs_fs_info { spinlock_t ref_cache_lock; u64 total_ref_cache_size; + /* + * these three are in extended format (availability of single + * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other + * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) + */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; - u64 data_alloc_profile; - u64 metadata_alloc_profile; - u64 system_alloc_profile; + + /* restriper state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_running; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; unsigned data_chunk_allocations; unsigned metadata_ratio; @@ -1383,6 +1464,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +#define BTRFS_BALANCE_ITEM_KEY 248 + /* * string items are for debugging. They just store a short string of * data in the FS @@ -1413,6 +1496,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2077,8 +2161,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); -/* struct btrfs_super_block */ +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); +} + +static inline void +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); +} + +/* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, @@ -2500,6 +2662,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) } static inline void free_fs_info(struct btrfs_fs_info *fs_info) { + kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); kfree(fs_info->tree_root); @@ -2510,6 +2673,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } +/** + * profile_is_valid - tests whether a given profile is valid and reduced + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static inline int profile_is_valid(u64 flags, int extended) +{ + u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + if (extended) + mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & mask) + return 0; + /* true if zero or exactly one bit set */ + return (flags & (~flags + 1)) == flags; +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c7ddf8a01c54..9c1eccc2c503 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1719,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_gid = btrfs_stack_inode_gid(inode_item); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); - inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 858ab347413e..e5167219c266 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2002,12 +2002,20 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); + sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - fs_info->btree_inode->i_nlink = 1; + set_nlink(fs_info->btree_inode, 1); /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of @@ -2319,9 +2327,6 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; ret = btrfs_init_space_info(fs_info); if (ret) { @@ -2424,6 +2429,10 @@ retry_root_backup: if (!err) err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + + if (!err) + err = btrfs_recover_balance(fs_info->tree_root); + if (err) { close_ctree(tree_root); return ERR_PTR(err); @@ -2973,6 +2982,9 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + /* pause restriper - we want to resume on mount */ + btrfs_pause_balance(root->fs_info); + btrfs_scrub_cancel(root); /* wait for any defraggers to finish */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bf30f670cda9..1c1cf216be80 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; - flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA; + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { @@ -2267,9 +2266,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(extent_op); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; + goto next; } list_del_init(&locked_ref->cluster); @@ -2289,7 +2286,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(ref); kfree(extent_op); count++; - +next: + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } @@ -2317,6 +2318,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: @@ -2993,9 +2998,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); - found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | - BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA); + found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; found->total_bytes = total_bytes; found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; @@ -3016,20 +3019,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP); - if (extra_flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; - } + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + /* chunk -> extended profile */ + if (extra_flags == 0) + extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; } +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Returns reduced profile in chunk format. If profile changing is in + * progress (either running or paused) picks the target profile (if it's + * already available), otherwise falls back to plain reducing. + */ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* @@ -3040,6 +3050,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + /* pick restriper's target profile if it's available */ + spin_lock(&root->fs_info->balance_lock); + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + if ((flags & BTRFS_BLOCK_GROUP_DATA) && + (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->data.target)) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && + (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->sys.target)) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && + (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->meta.target)) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + spin_unlock(&root->fs_info->balance_lock); + flags = tgt; + goto out; + } + } + spin_unlock(&root->fs_info->balance_lock); + if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); if (num_devices < 4) @@ -3059,22 +3097,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) if ((flags & BTRFS_BLOCK_GROUP_RAID0) && ((flags & BTRFS_BLOCK_GROUP_RAID1) | (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) + (flags & BTRFS_BLOCK_GROUP_DUP))) { flags &= ~BTRFS_BLOCK_GROUP_RAID0; + } + +out: + /* extended -> chunk profile */ + flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits & - root->fs_info->data_alloc_profile; + flags |= root->fs_info->avail_data_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits & - root->fs_info->system_alloc_profile; + flags |= root->fs_info->avail_system_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits & - root->fs_info->metadata_alloc_profile; + flags |= root->fs_info->avail_metadata_alloc_bits; + return btrfs_reduce_alloc_profile(root, flags); } @@ -3257,27 +3298,12 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - - /* - * we have two similar checks here, one based on percentage - * and once based on a hard number of 256MB. The idea - * is that if we have a good amount of free - * room, don't allocate a chunk. A good mount is - * less than 80% utilized of the chunks we have allocated, - * or more than 256MB free - */ - if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) - return 0; - - if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) - return 0; - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 5% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + /* 256MB or 2% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) return 0; return 1; } @@ -3291,7 +3317,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - flags = btrfs_reduce_alloc_profile(extent_root, flags); + BUG_ON(!profile_is_valid(flags, 0)); space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { @@ -3416,7 +3442,8 @@ static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, smp_mb(); nr_pages = min_t(unsigned long, nr_pages, root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); if (reserved > space_info->bytes_may_use) @@ -5294,15 +5321,6 @@ alloc: if (unlikely(block_group->ro)) goto loop; - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - /* * Ok we want to try and use the cluster allocator, so * lets look there @@ -5348,8 +5366,15 @@ refill_cluster: * plenty of times and not have found * anything, so we are likely way too * fragmented for the clustering stuff to find - * anything. */ - if (loop >= LOOP_NO_EMPTY_SIZE) { + * anything. + * + * However, if the cluster is taken from the + * current block group, release the cluster + * first, so that we stand a better chance of + * succeeding in the unclustered + * allocation. */ + if (loop >= LOOP_NO_EMPTY_SIZE && + last_ptr->block_group != block_group) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } @@ -5360,6 +5385,11 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5400,6 +5430,15 @@ refill_cluster: } unclustered_alloc: + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5437,9 +5476,6 @@ checks: goto loop; } - ins->objectid = search_start; - ins->offset = num_bytes; - if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -6791,6 +6827,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + /* pick restriper's target profile and return */ + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + /* extended -> chunk profile */ + tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + return tgt; + } + } + /* * we add in the count of missing devices because we want * to make sure that any RAID levels on a degraded FS @@ -7466,6 +7525,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return 0; } +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + /* chunk -> extended profile */ + if (extra_flags == 0) + extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits &= ~extra_flags; +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start) { @@ -7476,6 +7551,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_key key; struct inode *inode; int ret; + int index; int factor; root = root->fs_info->extent_root; @@ -7491,6 +7567,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, block_group); memcpy(&key, &block_group->key, sizeof(key)); + index = get_block_group_index(block_group); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -7565,6 +7642,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) + clear_avail_alloc_bits(root->fs_info, block_group->flags); up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index cc7492c823f3..97fbe939c050 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); + nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); + nrptrs = max(nrptrs, 8); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -1836,7 +1838,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) switch (origin) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek_unlocked(file, offset, origin); + offset = generic_file_llseek(file, offset, origin); goto out; case SEEK_DATA: case SEEK_HOLE: diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b3cbb8939fa3..6c7887a7770c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2289,23 +2289,23 @@ out: static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long search_bits; - unsigned long total_bits; + unsigned long want_bits; + unsigned long min_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; int ret; - bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - search_bits = bytes_to_bits(bytes, block_group->sectorsize); - total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, block_group->sectorsize); + min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -2314,7 +2314,7 @@ again: i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= search_bits) { + if (next_zero - i >= min_bits) { found_bits = next_zero - i; break; } @@ -2324,10 +2324,9 @@ again: if (!found_bits) return -ENOSPC; - if (!found) { + if (!total_found) { start = i; cluster->max_size = 0; - found = true; } total_found += found_bits; @@ -2335,13 +2334,8 @@ again: if (cluster->max_size < found_bits * block_group->sectorsize) cluster->max_size = found_bits * block_group->sectorsize; - if (total_found < total_bits) { - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); - if (i - start > total_bits * 2) { - total_found = 0; - cluster->max_size = 0; - found = false; - } + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; goto again; } @@ -2357,23 +2351,23 @@ again: /* * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; - struct btrfs_free_space *prev = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_start; u64 window_free; u64 max_extent; - u64 max_gap = 128 * 1024; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2383,8 +2377,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * We don't want bitmaps, so just move along until we find a normal * extent entry. */ - while (entry->bitmap) { - if (list_empty(&entry->list)) + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) @@ -2397,12 +2391,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, max_extent = entry->bytes; first = entry; last = entry; - prev = entry; - while (window_free <= min_bytes) { - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { @@ -2411,26 +2402,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, continue; } - /* - * we haven't filled the empty size and the window is - * very large. reset and try again - */ - if (entry->offset - (prev->offset + prev->bytes) > max_gap || - entry->offset - window_start > (min_bytes * 2)) { - first = entry; - window_start = entry->offset; - window_free = entry->bytes; - last = entry; + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) max_extent = entry->bytes; - } else { - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) - max_extent = entry->bytes; - } - prev = entry; } + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + cluster->window_start = first->offset; node = &first->offset_index; @@ -2444,7 +2427,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap) + if (entry->bitmap || entry->bytes < min_bytes) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); @@ -2466,7 +2449,7 @@ static noinline int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2491,7 +2474,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (entry->bytes < min_bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); + bytes, cont1_bytes, min_bytes); if (!ret) return 0; } @@ -2505,7 +2488,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, /* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. + * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise @@ -2521,23 +2504,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; + u64 cont1_bytes; int ret; - /* for metadata, allow allocates with more holes */ + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. + */ if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; + cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); + cont1_bytes = bytes; + min_bytes = block_group->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = block_group->sectorsize; + } spin_lock(&ctl->tree_lock); @@ -2545,7 +2529,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ - if (ctl->free_space < min_bytes) { + if (ctl->free_space < bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } @@ -2559,10 +2543,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, } ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes, min_bytes); + bytes + empty_size, + cont1_bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes, min_bytes); + offset, bytes + empty_size, + cont1_bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13b0542015ff..fd1a06df5bc6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2358,7 +2358,7 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); - inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); inode->i_uid = btrfs_inode_uid(leaf, inode_item); inode->i_gid = btrfs_inode_gid(leaf, inode_item); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); @@ -6698,7 +6698,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - inode->i_nlink = 1; + set_nlink(inode, 1); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fe8a60c865eb..ef909b5d3d2e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1213,13 +1213,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -1236,7 +1244,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; - goto out_unlock; + goto out_free; } if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; @@ -1251,7 +1259,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, new_size = memparse(sizestr, NULL); if (new_size == 0) { ret = -EINVAL; - goto out_unlock; + goto out_free; } } @@ -1260,7 +1268,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_unlock; + goto out_free; } new_size = old_size - new_size; } else if (mod > 0) { @@ -1269,11 +1277,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size < 256 * 1024 * 1024) { ret = -EINVAL; - goto out_unlock; + goto out_free; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_unlock; + goto out_free; } do_div(new_size, root->sectorsize); @@ -1286,7 +1294,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_unlock; + goto out_free; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); @@ -1294,9 +1302,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = btrfs_shrink_device(device, new_size); } -out_unlock: - mutex_unlock(&root->fs_info->volume_mutex); +out_free: kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2062,14 +2071,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2084,14 +2104,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -3044,6 +3075,163 @@ out: return ret; } +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + bargs->flags = bctl->flags; + + if (atomic_read(&fs_info->balance_running)) + bargs->state |= BTRFS_BALANCE_STATE_RUNNING; + if (atomic_read(&fs_info->balance_pause_req)) + bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + if (atomic_read(&fs_info->balance_cancel_req)) + bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; + + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); + memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); + memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); + + if (lock) { + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); + } else { + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + } +} + +static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + struct btrfs_balance_control *bctl; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (arg) { + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + goto out; + } + + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out_bargs; + } + + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + + goto do_balance; + } + } else { + bargs = NULL; + } + + if (fs_info->balance_ctl) { + ret = -EINPROGRESS; + goto out_bargs; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out_bargs; + } + + bctl->fs_info = fs_info; + if (arg) { + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + + bctl->flags = bargs->flags; + } else { + /* balance everything - no filters */ + bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + } + +do_balance: + ret = btrfs_balance(bctl, bargs); + /* + * bctl is freed in __cancel_balance or in free_fs_info if + * restriper was paused all the way until unmount + */ + if (arg) { + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + } + +out_bargs: + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + +static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case BTRFS_BALANCE_CTL_PAUSE: + return btrfs_pause_balance(root->fs_info); + case BTRFS_BALANCE_CTL_CANCEL: + return btrfs_cancel_balance(root->fs_info); + } + + return -EINVAL; +} + +static long btrfs_ioctl_balance_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out; + } + + bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + if (!bargs) { + ret = -ENOMEM; + goto out; + } + + update_ioctl_balance_args(fs_info, 1, bargs); + + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3088,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_balance(root->fs_info->dev_root); + return btrfs_ioctl_balance(root, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3120,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_BALANCE_V2: + return btrfs_ioctl_balance(root, argp); + case BTRFS_IOC_BALANCE_CTL: + return btrfs_ioctl_balance_ctl(root, arg); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 252ae9915de8..4f69028a68c4 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; +/* balance control ioctl modes */ +#define BTRFS_BALANCE_CTL_PAUSE 1 +#define BTRFS_BALANCE_CTL_CANCEL 2 + +/* + * this is packed, because it should be exactly the same as its disk + * byte order counterpart (struct btrfs_disk_balance_args) + */ +struct btrfs_balance_args { + __u64 profiles; + __u64 usage; + __u64 devid; + __u64 pstart; + __u64 pend; + __u64 vstart; + __u64 vend; + + __u64 target; + + __u64 flags; + + __u64 unused[8]; +} __attribute__ ((__packed__)); + +/* report balance progress to userspace */ +struct btrfs_balance_progress { + __u64 expected; /* estimated # of chunks that will be + * relocated to fulfill the request */ + __u64 considered; /* # of chunks we have considered so far */ + __u64 completed; /* # of chunks relocated so far */ +}; + +#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) +#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) +#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) + +struct btrfs_ioctl_balance_args { + __u64 flags; /* in/out */ + __u64 state; /* out */ + + struct btrfs_balance_args data; /* in/out */ + struct btrfs_balance_args meta; /* in/out */ + struct btrfs_balance_args sys; /* in/out */ + + struct btrfs_balance_progress stat; /* out */ + + __u64 unused[72]; /* pad to 1k */ +}; + #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ + struct btrfs_ioctl_balance_args) +#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) +#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ + struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 34a8b6112ea4..5a7227fa9380 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,8 +164,9 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, + Opt_no_space_cache, Opt_recovery, Opt_skip_balance, + Opt_err, }; static match_table_t tokens = { @@ -200,6 +201,7 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, + {Opt_skip_balance, "skip_balance"}, {Opt_err, NULL}, }; @@ -398,6 +400,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; + case Opt_skip_balance: + btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -723,6 +728,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(root, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); + if (btrfs_test_opt(root, SKIP_BALANCE)) + seq_puts(seq, ",skip_balance"); return 0; } @@ -826,13 +833,9 @@ static char *setup_root_args(char *args) static struct dentry *mount_subvol(const char *subvol_name, int flags, const char *device_name, char *data) { - struct super_block *s; struct dentry *root; struct vfsmount *mnt; - struct mnt_namespace *ns_private; char *newargs; - struct path path; - int error; newargs = setup_root_args(data); if (!newargs) @@ -843,39 +846,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, if (IS_ERR(mnt)) return ERR_CAST(mnt); - ns_private = create_mnt_ns(mnt); - if (IS_ERR(ns_private)) { - mntput(mnt); - return ERR_CAST(ns_private); - } - - /* - * This will trigger the automount of the subvol so we can just - * drop the mnt we have here and return the dentry that we - * found. - */ - error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, - LOOKUP_FOLLOW, &path); - put_mnt_ns(ns_private); - if (error) - return ERR_PTR(error); + root = mount_subtree(mnt, subvol_name); - if (!is_subvolume_inode(path.dentry->d_inode)) { - path_put(&path); - mntput(mnt); - error = -EINVAL; + if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", subvol_name); - return ERR_PTR(-EINVAL); } - /* Get a ref to the sb and the dentry we found and return it */ - s = path.mnt->mnt_sb; - atomic_inc(&s->s_active); - root = dget(path.dentry); - path_put(&path); - down_write(&s->s_umount); - return root; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3c..360c2dfd1ee6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -467,19 +467,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - while (count < 4) { + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f4d81c06d48f..3568374d419d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1031,7 +1031,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } btrfs_release_path(path); if (nlink != inode->i_nlink) { - inode->i_nlink = nlink; + set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fbb493b28d5a..e0b7bb92a170 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> +#include <linux/kthread.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -1280,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bool clear_super = false; mutex_lock(&uuid_mutex); - mutex_lock(&root->fs_info->volume_mutex); all_avail = root->fs_info->avail_data_alloc_bits | root->fs_info->avail_system_alloc_bits | @@ -1450,7 +1450,6 @@ error_close: if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: - mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -1626,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } filemap_write_and_wait(bdev->bd_inode->i_mapping); - mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; /* @@ -1754,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); BUG_ON(ret); } -out: - mutex_unlock(&root->fs_info->volume_mutex); + return ret; error: blkdev_put(bdev, FMODE_EXCL); @@ -1763,7 +1760,7 @@ error: mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } - goto out; + return ret; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -2074,6 +2071,362 @@ error: return ret; } +static int insert_balance_item(struct btrfs_root *root, + struct btrfs_balance_control *bctl) +{ + struct btrfs_trans_handle *trans; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); + btrfs_set_balance_data(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); + btrfs_set_balance_meta(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); + btrfs_set_balance_sys(leaf, item, &disk_bargs); + + btrfs_set_balance_flags(leaf, item, bctl->flags); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +static int del_balance_item(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +/* + * This is a heuristic used to reduce the number of chunks balanced on + * resume after balance was interrupted. + */ +static void update_balance_args(struct btrfs_balance_control *bctl) +{ + /* + * Turn on soft mode for chunk types that were being converted. + */ + if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; + + /* + * Turn on usage filter if is not already used. The idea is + * that chunks that we have already balanced should be + * reasonably full. Don't do it for chunks that are being + * converted - that will keep us from relocating unconverted + * (albeit full) chunks. + */ + if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->data.usage = 90; + } + if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->sys.usage = 90; + } + if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->meta.usage = 90; + } +} + +/* + * Should be called with both balance and volume mutexes held to + * serialize other volume operations (add_dev/rm_dev/resize) with + * restriper. Same goes for unset_balance_control. + */ +static void set_balance_control(struct btrfs_balance_control *bctl) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + + BUG_ON(fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); +} + +static void unset_balance_control(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + BUG_ON(!fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = NULL; + spin_unlock(&fs_info->balance_lock); + + kfree(bctl); +} + +/* + * Balance filters. Return 1 if chunk should be filtered out + * (should not be balanced). + */ +static int chunk_profiles_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->profiles & chunk_profile) + return 0; + + return 1; +} + +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor <= 0) + return 0; + if (factor >= 100) + return num; + + num *= factor; + do_div(num, 100); + return num; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; + u64 chunk_used, user_thresh; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (chunk_used < user_thresh) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_devid_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + int i; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) + return 0; + } + + return 1; +} + +/* [pstart, pend) */ +static int chunk_drange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + u64 stripe_offset; + u64 stripe_length; + int factor; + int i; + + if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) + return 0; + + if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + factor = num_stripes / factor; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) + continue; + + stripe_offset = btrfs_stripe_offset(leaf, stripe); + stripe_length = btrfs_chunk_length(leaf, chunk); + do_div(stripe_length, factor); + + if (stripe_offset < bargs->pend && + stripe_offset + stripe_length > bargs->pstart) + return 0; + } + + return 1; +} + +/* [vstart, vend) */ +static int chunk_vrange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + if (chunk_offset < bargs->vend && + chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) + /* at least part of the chunk is inside this vrange */ + return 0; + + return 1; +} + +static int chunk_soft_convert_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return 0; + + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->target & chunk_profile) + return 1; + + return 0; +} + +static int should_balance_chunk(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 chunk_offset) +{ + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + struct btrfs_balance_args *bargs = NULL; + u64 chunk_type = btrfs_chunk_type(leaf, chunk); + + /* type filter */ + if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & + (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { + return 0; + } + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + bargs = &bctl->data; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + bargs = &bctl->sys; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + bargs = &bctl->meta; + + /* profiles filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && + chunk_profiles_filter(chunk_type, bargs)) { + return 0; + } + + /* usage filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && + chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; + } + + /* devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && + chunk_devid_filter(leaf, chunk, bargs)) { + return 0; + } + + /* drange filter, makes sense only with devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && + chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* vrange filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && + chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* soft profile changing mode */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && + chunk_soft_convert_filter(chunk_type, bargs)) { + return 0; + } + + return 1; +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2083,29 +2436,28 @@ static u64 div_factor(u64 num, int factor) return num; } -int btrfs_balance(struct btrfs_root *dev_root) +static int __btrfs_balance(struct btrfs_fs_info *fs_info) { - int ret; - struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_root *dev_root = fs_info->dev_root; + struct list_head *devices; struct btrfs_device *device; u64 old_size; u64 size_to_free; + struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; - struct btrfs_trans_handle *trans; struct btrfs_key found_key; - - if (dev_root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&dev_root->fs_info->volume_mutex); - dev_root = dev_root->fs_info->dev_root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + int slot; + int ret; + int enospc_errors = 0; + bool counting = true; /* step one make some room on all the devices */ + devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); @@ -2134,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root) ret = -ENOMEM; goto error; } + + /* zero out stat counters */ + spin_lock(&fs_info->balance_lock); + memset(&bctl->stat, 0, sizeof(bctl->stat)); + spin_unlock(&fs_info->balance_lock); +again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + if ((!counting && atomic_read(&fs_info->balance_pause_req)) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -ECANCELED; + goto error; + } + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -2148,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root) * failed */ if (ret == 0) - break; + BUG(); /* FIXME break ? */ ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) + if (ret) { + ret = 0; break; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); if (found_key.objectid != key.objectid) break; @@ -2164,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root) if (found_key.offset == 0) break; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + + if (!counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.considered++; + spin_unlock(&fs_info->balance_lock); + } + + ret = should_balance_chunk(chunk_root, leaf, chunk, + found_key.offset); btrfs_release_path(path); + if (!ret) + goto loop; + + if (counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.expected++; + spin_unlock(&fs_info->balance_lock); + goto loop; + } + ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) goto error; + if (ret == -ENOSPC) { + enospc_errors++; + } else { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed++; + spin_unlock(&fs_info->balance_lock); + } +loop: key.offset = found_key.offset - 1; } - ret = 0; + + if (counting) { + btrfs_release_path(path); + counting = false; + goto again; + } error: btrfs_free_path(path); - mutex_unlock(&dev_root->fs_info->volume_mutex); + if (enospc_errors) { + printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + enospc_errors); + if (!ret) + ret = -ENOSPC; + } + return ret; } +static inline int balance_need_close(struct btrfs_fs_info *fs_info) +{ + /* cancel requested || normal exit path */ + return atomic_read(&fs_info->balance_cancel_req) || + (atomic_read(&fs_info->balance_pause_req) == 0 && + atomic_read(&fs_info->balance_cancel_req) == 0); +} + +static void __cancel_balance(struct btrfs_fs_info *fs_info) +{ + int ret; + + unset_balance_control(fs_info); + ret = del_balance_item(fs_info->tree_root); + BUG_ON(ret); +} + +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs); + +/* + * Should be called with both balance and volume mutexes held + */ +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 allowed; + int ret; + + if (btrfs_fs_closing(fs_info) || + atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -EINVAL; + goto out; + } + + /* + * In case of mixed groups both data and meta should be picked, + * and identical options should be given for both of them. + */ + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { + if (!(bctl->flags & BTRFS_BALANCE_DATA) || + !(bctl->flags & BTRFS_BALANCE_METADATA) || + memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { + printk(KERN_ERR "btrfs: with mixed groups data and " + "metadata balance options must be the same\n"); + ret = -EINVAL; + goto out; + } + } + + /* + * Profile changing sanity checks. Skip them if a simple + * balance is requested. + */ + if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & + BTRFS_BALANCE_ARGS_CONVERT)) + goto do_balance; + + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + if (fs_info->fs_devices->num_devices == 1) + allowed |= BTRFS_BLOCK_GROUP_DUP; + else if (fs_info->fs_devices->num_devices < 4) + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); + else + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10); + + if (!profile_is_valid(bctl->data.target, 1) || + bctl->data.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "data profile %llu\n", + (unsigned long long)bctl->data.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->meta.target, 1) || + bctl->meta.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "metadata profile %llu\n", + (unsigned long long)bctl->meta.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->sys.target, 1) || + bctl->sys.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "system profile %llu\n", + (unsigned long long)bctl->sys.target); + ret = -EINVAL; + goto out; + } + + if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { + printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + ret = -EINVAL; + goto out; + } + + /* allow to reduce meta or sys integrity only if force set */ + allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } + } + +do_balance: + ret = insert_balance_item(fs_info->tree_root, bctl); + if (ret && ret != -EEXIST) + goto out; + + if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { + BUG_ON(ret == -EEXIST); + set_balance_control(bctl); + } else { + BUG_ON(ret != -EEXIST); + spin_lock(&fs_info->balance_lock); + update_balance_args(bctl); + spin_unlock(&fs_info->balance_lock); + } + + atomic_inc(&fs_info->balance_running); + mutex_unlock(&fs_info->balance_mutex); + + ret = __btrfs_balance(fs_info); + + mutex_lock(&fs_info->balance_mutex); + atomic_dec(&fs_info->balance_running); + + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, 0, bargs); + } + + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + + wake_up(&fs_info->balance_wait_q); + + return ret; +out: + if (bctl->flags & BTRFS_BALANCE_RESUME) + __cancel_balance(fs_info); + else + kfree(bctl); + return ret; +} + +static int balance_kthread(void *data) +{ + struct btrfs_balance_control *bctl = + (struct btrfs_balance_control *)data; + struct btrfs_fs_info *fs_info = bctl->fs_info; + int ret = 0; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + set_balance_control(bctl); + + if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + printk(KERN_INFO "btrfs: force skipping balance\n"); + } else { + printk(KERN_INFO "btrfs: continuing balance\n"); + ret = btrfs_balance(bctl, NULL); + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + +int btrfs_recover_balance(struct btrfs_root *tree_root) +{ + struct task_struct *tsk; + struct btrfs_balance_control *bctl; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + goto out_bctl; + if (ret > 0) { /* ret = -ENOENT; */ + ret = 0; + goto out_bctl; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + bctl->fs_info = tree_root->fs_info; + bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; + + btrfs_balance_data(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); + btrfs_balance_meta(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); + btrfs_balance_sys(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + + tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); + if (IS_ERR(tsk)) + ret = PTR_ERR(tsk); + else + goto out; + +out_bctl: + kfree(bctl); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_pause_balance(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + if (atomic_read(&fs_info->balance_running)) { + atomic_inc(&fs_info->balance_pause_req); + mutex_unlock(&fs_info->balance_mutex); + + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + + mutex_lock(&fs_info->balance_mutex); + /* we are good with balance_ctl ripped off from under us */ + BUG_ON(atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_pause_req); + } else { + ret = -ENOTCONN; + } + + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + atomic_inc(&fs_info->balance_cancel_req); + /* + * if we are running just wait and return, balance item is + * deleted in btrfs_balance in this case + */ + if (atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + mutex_lock(&fs_info->balance_mutex); + } else { + /* __cancel_balance needs volume_mutex */ + mutex_unlock(&fs_info->balance_mutex); + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) + __cancel_balance(fs_info); + + mutex_unlock(&fs_info->volume_mutex); + } + + BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_cancel_req); + mutex_unlock(&fs_info->balance_mutex); + return 0; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -2437,7 +3158,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_stripe_size = 256 * 1024 * 1024; + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 8 * 1024 * 1024; @@ -2748,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - (fs_info->metadata_alloc_profile & - fs_info->avail_metadata_alloc_bits); + fs_info->avail_metadata_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, @@ -2759,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - (fs_info->system_alloc_profile & - fs_info->avail_system_alloc_bits); + fs_info->avail_system_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, @@ -2937,10 +3660,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, if (rw & REQ_DISCARD) *length = min_t(u64, em->len - offset, *length); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { + else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, map->stripe_len - stripe_offset); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c1701ec9d49f..19ac95048b88 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -186,6 +186,51 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) +/* + * Restriper's general type filter + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) + +/* + * Balance filters + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) + +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) + +struct btrfs_balance_args; +struct btrfs_balance_progress; +struct btrfs_balance_control { + struct btrfs_fs_info *fs_info; + + struct btrfs_balance_args data; + struct btrfs_balance_args meta; + struct btrfs_balance_args sys; + + u64 flags; + + struct btrfs_balance_progress stat; +}; + int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); @@ -228,7 +273,11 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_root *dev_root); +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs); +int btrfs_recover_balance(struct btrfs_root *tree_root); +int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); |